In [1]:
from pathlib import Path
from modules import run_pipeline
from modules.utilities import normalize_path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sample track to process and the root folder for pipeline artifacts. Both are
# given relative to the notebook's working directory and passed through the
# project's `normalize_path` helper.
# NOTE(review): the recorded output shows absolute paths returned as `str`;
# confirm the exact normalization rules in modules.utilities.
input_file = normalize_path(Path('./input/audio/test1.mp3'))
DEFAULT_OUTPUT_DIR = normalize_path(Path("./output"))

# Echo the normalized values together with their runtime types.
print("Input File: ", input_file, type(input_file))
print("Output Directory: ", DEFAULT_OUTPUT_DIR, type(DEFAULT_OUTPUT_DIR))

Input File:  C:\Users\chris\ai_karaoke_app\input\audio\test1.mp3 <class 'str'>
Output Directory:  C:\Users\chris\ai_karaoke_app\output <class 'str'>


In [3]:
# Run the project's pipeline on the sample track with DEFAULT_OUTPUT_DIR as the
# output root, with verbose logging enabled.
# NOTE(review): with override_all=False the recorded log shows stages being
# skipped because their outputs already exist - presumably True forces every
# stage to re-run; confirm against modules.run_pipeline.
run_pipeline(
    input_file,
    DEFAULT_OUTPUT_DIR,
    override_all=False,
    verbose=True,
)

[37m2025-01-23 09:35 - [modules.audio_pre_processing.utilities] - DEBUG - File hash: acb59db18c5612564488035a65131a78ef6461a4a49555dd59f78f131a01fc21[0m
[34m2025-01-23 09:35 - [modules.audio_pre_processing.process] - INFO - Directory: C:\Users\chris\ai_karaoke_app\output\acb59db18c5612564488035a65131a78ef6461a4a49555dd59f78f131a01fc21[0m
[34m2025-01-23 09:35 - [modules.audio_pre_processing.process] - INFO - Skipping audio pre-processing... Audio metadata already exist in the output directory...[0m
[34m2025-01-23 09:35 - [modules.stem_processing.stem_separation.process] - INFO - Skipping stem separation... Audio stems already exist in the output directory...[0m
[34m2025-01-23 09:35 - [modules.stem_processing.stem_merging.process] - INFO - Skipping audio merging... Instrumental audio already exists in the output directory...[0m
[34m2025-01-23 09:35 - [modules.lyrics_processing.extract_lyrics.process] - INFO - Skipping lyric extraction... Lyrics raw data already exist in the ou

ALIGNED LYRICS AFTER AI PROCESSING:
[WordAlignment(word='I', start=31.04, end=32.08),
 WordAlignment(word='knock', start=32.08, end=32.38),
 WordAlignment(word='on', start=32.38, end=32.68),
 WordAlignment(word='the', start=32.68, end=32.88),
 WordAlignment(word='door', start=32.88, end=33.3),
 WordAlignment(word='I', start=33.94, end=34.4),
 WordAlignment(word='tap', start=33.94, end=34.4),
 WordAlignment(word='on', start=34.4, end=34.68),
 WordAlignment(word='the', start=34.68, end=34.88),
 WordAlignment(word='floor', start=34.88, end=35.24),
 WordAlignment(word="It's", start=35.24, end=36.16),
 WordAlignment(word='gloomy', start=36.16, end=36.8),
 WordAlignment(word='in', start=36.8, end=37.22),
 WordAlignment(word='my', start=37.22, end=37.66),
 WordAlignment(word='mind', start=37.66, end=38.22),
 WordAlignment(word='I', start=39.82, end=40.36),
 WordAlignment(word='scratch', start=39.82, end=40.36),
 WordAlignment(word='on', start=40.36, end=40.7),
 WordAlignment(word='the', start

In [4]:
import torch
import subprocess
from pathlib import Path
from PIL import Image
import subprocess

def extract_audio_duration(audio_path):
    """
    Get the duration of an audio file using ffprobe.

    Parameters:
    - audio_path (str): Path to the audio file to probe.

    Returns:
    - float: Duration in seconds, parsed from ffprobe's `format=duration` entry.

    Raises:
    - RuntimeError: If ffprobe exits with a non-zero status (the message carries
      ffprobe's stderr), or on any other unexpected failure such as the ffprobe
      executable not being found.
    - ValueError: If ffprobe's output cannot be converted to a float
      (for example an empty result or "N/A").
    """
    try:
        # Run the ffprobe command to extract the duration of the audio file
        result = subprocess.run(
            [
                "ffprobe",                              # Command to run ffprobe
                "-i", audio_path,                       # Input file path
                "-show_entries", "format=duration",     # Request only the duration metadata
                # "error" (not "quiet") keeps stdout clean while still letting
                # ffprobe explain failures on stderr; with "quiet" the
                # RuntimeError below would always carry an empty message.
                "-v", "error",
                "-of", "csv=p=0"                        # Format the output as plain CSV with no headers
            ],
            stdout=subprocess.PIPE,                     # Capture the standard output
            stderr=subprocess.PIPE,                     # Capture the standard error
            check=True                                  # Raise CalledProcessError if the command fails
        )

        # Decode the stdout result to a string, strip whitespace, and convert to float
        return float(result.stdout.decode("utf-8").strip())

    except subprocess.CalledProcessError as e:
        # Handle errors from ffprobe (e.g., if the command fails or the file is invalid).
        # Decode leniently: a non-UTF-8 diagnostic must not raise a second
        # exception here and hide the real failure.
        details = (e.stderr or b"").decode("utf-8", errors="replace")
        raise RuntimeError(f"ffprobe failed: {details}") from e

    except ValueError as e:
        # Handle cases where the output cannot be converted to a float
        raise ValueError("Invalid duration format received from ffprobe.") from e

    except Exception as e:
        # Handle any other unexpected errors (e.g. ffprobe is not installed)
        raise RuntimeError(f"Unexpected error: {e}") from e

# def preprocess_image(image_path, resolution):
#     output_path = str(Path("./audio_processing/karaoke_files/preprocessed_images/temp_image.png"))
#     try:
#         img = Image.open(image_path)
#         # Ensure RGB format
#         img = img.convert("RGB")  
#         width, height = map(int, resolution.split("x"))
#         # Match video resolution
#         img = img.resize((width, height))  
#         img.save(output_path, "PNG")
#         return output_path
#     except Exception as e:
#         print(f"Error processing image: {e}")
#         return None

def _escape_ffmpeg_filter_path(path):
    """Quote a file path so it survives both FFmpeg filter parsing passes."""
    # Pass 1 - filter option parser: backslash, colon and single quote are
    # special (a bare "C:" drive prefix would be read as an option separator).
    option_value = (
        str(path)
        .replace("\\", "\\\\")
        .replace(":", "\\:")
        .replace("'", "\\'")
    )
    # Pass 2 - filtergraph parser: single-quote the whole value so the escapes
    # above arrive intact; an embedded quote must close, escape, and reopen.
    return "'" + option_value.replace("'", "'\\''") + "'"


def generate_karaoke_video(
    audio_path,
    ass_path,
    output_path,
    resolution="1280x720",
    preset="fast",
    crf=23,
    fps=24,
    bitrate="3000k",
    audio_bitrate="192k",
    background_image=False,
):
    """
    Generate a karaoke video from an audio track and an ASS subtitle file.

    The subtitles are burned into either a looped background image or, when no
    image is given, a black background. The NVENC encoder is selected when
    torch reports a CUDA device, otherwise libx264 is used.

    Parameters:
    - audio_path (str): Path to the input audio file.
    - ass_path (str): Path to the ASS subtitle file. The path is escaped for
      FFmpeg's filter syntax, so Windows drive letters are safe to pass.
    - output_path (str): Path to save the generated video (overwritten if it exists).
    - resolution (str): Video resolution (default: "1280x720").
    - preset (str): FFmpeg encoding preset for speed/quality tradeoff (default: "fast").
    - crf (int): Quality setting for video encoding (lower is better, default: 23).
    - fps (int): Frames per second for the video (default: 24).
    - bitrate (str): Video bitrate for quality control (default: "3000k").
    - audio_bitrate (str): Audio bitrate for output quality (default: "192k").
    - background_image (str | bool): Path to an image used as the background,
      scaled to `resolution`. Any falsy value (default: False) selects a black
      background. The image is passed to FFmpeg as-is, without preprocessing.

    Returns:
    - None. Progress and FFmpeg failures are reported with print().

    Raises:
    - RuntimeError / ValueError: Propagated from extract_audio_duration() when
      the black background is used and the audio duration cannot be read.
    """

    # Check for GPU availability
    # NOTE(review): a CUDA device visible to torch does not guarantee that the
    # installed FFmpeg build includes h264_nvenc - confirm on the target machine.
    if torch.cuda.is_available():
        # Use NVIDIA NVENC for GPU acceleration
        device = torch.cuda.get_device_name(0)
        print(f"✅ GPU detected: {device}")
        video_codec = "h264_nvenc"
    else:
        # Use CPU codec
        print("⚠️ No GPU detected. Falling back to CPU.")
        video_codec = "libx264"

    # Accept Path objects as well as strings; the command is echoed with
    # str.join below, which requires plain strings.
    audio_path = str(audio_path)
    output_path = str(output_path)
    subtitles_filter = f"subtitles={_escape_ffmpeg_filter_path(ass_path)}"

    # Build FFmpeg command
    command = ["ffmpeg", "-y"]  # Overwrite output

    if background_image:
        # Add background image
        command.extend(["-loop", "1", "-i", str(background_image)])  # Loop the background image
        # Add audio input
        command.extend(["-i", audio_path])
        # Scale the image, burn in the subtitles, and label the result so the
        # *filtered* video can be mapped (mapping "0:v" would select the raw image).
        filter_complex = f"[0:v]scale={resolution},{subtitles_filter}[v]"
        command.extend(["-filter_complex", filter_complex])
        # Map the filtered video and the audio stream
        command.extend(["-map", "[v]", "-map", "1:a"])
    else:
        # The synthetic black source needs an explicit length. The helper
        # raises on failure, so no None check is required here.
        audio_duration = extract_audio_duration(audio_path)
        # Add a black background
        command.extend(["-f", "lavfi", "-i", f"color=c=black:s={resolution}:d={audio_duration}"])  # Black background
        # Add audio input
        command.extend(["-i", audio_path])
        # Add subtitles directly
        command.extend(["-vf", subtitles_filter])
        # Pick streams explicitly so embedded cover art in the audio file is
        # never chosen as the video stream by automatic stream selection.
        command.extend(["-map", "0:v", "-map", "1:a"])

    # Add common video and audio options
    command.extend([
        "-pix_fmt", "yuv420p",  # Set standard pixel format
        "-c:v", video_codec,  # Video codec
        "-preset", preset,    # Encoding preset
        "-crf", str(crf),     # Quality level
        "-r", str(fps),       # Frame rate
        "-b:v", bitrate,      # Video bitrate
        "-c:a", "aac",        # Audio codec
        "-b:a", audio_bitrate,  # Audio bitrate
        "-shortest",  # Match shortest stream
        output_path  # Output file
    ])

    # Debugging: Print the constructed command
    print("\nRunning FFmpeg command:")
    print(" ".join(command))

    # Execute FFmpeg command
    try:
        subprocess.run(command, check=True)
        print(f"✅ Video successfully created at: {output_path}")
    except subprocess.CalledProcessError as e:
        print(f"❌ FFmpeg error: {e}")
    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")

In [None]:
# Artifacts of the cached pipeline run for the sample track.
audio_path = r'C:\Users\chris\ai_karaoke_app\output\acb59db18c5612564488035a65131a78ef6461a4a49555dd59f78f131a01fc21\instrumental.mp3'
ass_path = r'C:\Users\chris\ai_karaoke_app\output\acb59db18c5612564488035a65131a78ef6461a4a49555dd59f78f131a01fc21\karaoke_subtitles.ass'
out_path = r'C:\Users\chris\ai_karaoke_app\output\acb59db18c5612564488035a65131a78ef6461a4a49555dd59f78f131a01fc21\test.mp4'

# Resolve each location and switch it to forward slashes before rendering.
audio_path, ass_path, out_path = (
    str(Path(raw_path).resolve()).replace("\\", "/")
    for raw_path in (audio_path, ass_path, out_path)
)
generate_karaoke_video(audio_path, ass_path, out_path)