<a href="https://colab.research.google.com/github/ras0k/list2mp3/blob/main/list2mp3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title List to mp3
!pip install -U yt-dlp --quiet
!apt install ffmpeg -y

import subprocess
import os
import json
import glob
import ipywidgets as widgets
from IPython.display import display
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, parse_qs

# Clear temporary files from previous runs
def clear_folder():
    """Delete leftovers of earlier runs from the current working directory.

    Every file matching one of the known temporary/output patterns is removed.
    A file that cannot be deleted is reported and skipped so that one failure
    does not abort the rest of the cleanup.
    """
    patterns = [
        "*.mp3",
        "*.wav",  # intermediate file of the OPUS export, left behind when it fails
        "file_list.txt",
        "cover*",
        "cropped_cover*",
        "cropped_thumb_cover_*",  # cropped version of a playlist-wide cover
        "thumb_cover_*",
    ]
    for pattern in patterns:
        for file in glob.glob(pattern):
            try:
                os.remove(file)
            except OSError as e:
                print(f"Could not remove {file}: {e}")

# Run the cleanup as soon as the cell executes: list_to_mp3 concatenates every
# .mp3 it finds in the working directory, so stale files must not be present.
clear_folder()

# Normalise a watch URL that carries a playlist id, e.g.
#   https://www.youtube.com/watch?v=...&list=...
# into the canonical playlist form:
#   https://www.youtube.com/playlist?list=...
def transform_playlist_url(url):
    """Return the playlist URL for *url*, or *url* unchanged when no rewrite applies.

    A rewrite happens only when the URL is not already a ``playlist?`` URL and
    its query string contains a non-empty ``list`` parameter.
    """
    already_playlist = "playlist?" in url
    if already_playlist or "list=" not in url:
        return url

    list_values = parse_qs(urlparse(url).query).get("list")
    if not list_values:
        return url

    return f"https://www.youtube.com/playlist?list={list_values[0]}"

# Function to get playlist name using BeautifulSoup
def get_playlist_title_bs(url):
    """Fetch the page at *url* and return its <title> as a file-name-safe string.

    The trailing " - YouTube" is removed, quote characters become spaces,
    "//" becomes "-", characters that are invalid in file names are dropped and
    whitespace is collapsed.

    Returns:
        The sanitized title, or "merged_audio" when the page cannot be
        retrieved, has no <title>, or the sanitized title is empty.
    """
    fallback = "merged_audio"
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        print("Failed to retrieve the playlist page:", e)
        return fallback
    if response.status_code != 200:
        print("Failed to retrieve the playlist page.")
        return fallback

    soup = BeautifulSoup(response.text, "html.parser")
    title_tag = soup.find("title")
    if not title_tag:
        return fallback

    title = title_tag.text
    title = re.sub(r"\s*-\s*YouTube\s*$", "", title)
    title = re.sub(r"[\"'«»“”‘’]", " ", title)
    # Must run before "/" is stripped below, otherwise "//" can never match.
    title = title.replace("//", "-")
    title = re.sub(r"[\\/*?:<>|]", "", title)
    title = re.sub(r"\s+", " ", title).strip()
    # A title made only of stripped characters would otherwise yield ".mp3".
    return title or fallback

def get_playlist_name(url):
    """Look up the sanitized playlist title for *url*, echo it, and return it."""
    playlist_title = get_playlist_title_bs(url)
    print("Playlist title (sanitized):", playlist_title)
    return playlist_title

# Extract metadata from the first video.
# If the URL is a playlist, use --playlist-items 1 to get only the first video's metadata.
def get_video_metadata(url):
    """Return uploader, thumbnail URL and title of the first video behind *url*.

    The lookup is best-effort: any failure is printed and an empty dict is
    returned so the caller can continue without metadata.

    Returns:
        A dict with the keys "channel", "thumbnail" and "title", or ``{}`` on
        failure. Missing or null fields fall back to "Unknown Artist" / "".
    """
    try:
        url = transform_playlist_url(url)
        # An argument list (no shell) keeps "&", ";" or "$" in the URL from
        # being interpreted as shell syntax.
        cmd = ["yt-dlp", "-j"]
        if "playlist?list=" in url:
            cmd += ["--playlist-items", "1"]
        cmd.append(url)
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print("Error fetching video metadata.")
            return {}
        # -j prints one JSON document per line; only the first video matters.
        first_line = next(
            (line for line in result.stdout.splitlines() if line.strip()), ""
        )
        info = json.loads(first_line)
        if not isinstance(info, dict):
            print("Error fetching video metadata.")
            return {}
        # "or" also covers keys that are present but null in the JSON.
        return {
            "channel": info.get("uploader") or "Unknown Artist",
            "thumbnail": info.get("thumbnail") or "",
            "title": info.get("title") or "",
        }
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        print("Exception while fetching video metadata:", e)
        return {}

# Download thumbnail image from URL
def download_thumbnail(url, filename="cover.jpg", timeout=30):
    """Download *url* and store the response body in *filename*.

    Args:
        url: Address of the image.
        filename: Destination path; overwritten if it exists.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        True when the file was written, False on a non-200 response, a network
        error or a file-system error (the reason is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print("Failed to download thumbnail.")
            return False
        with open(filename, "wb") as f:
            f.write(response.content)
        return True
    except (requests.RequestException, OSError) as e:
        print("Exception while downloading thumbnail:", e)
        return False

# Crop the thumbnail to a centered square using ffmpeg.
def crop_thumbnail(input_file, output_file):
    """Write a centered square crop of *input_file* to *output_file*.

    The square's side is the shorter of the image's width and height.

    Returns:
        True when ffmpeg succeeded, False otherwise (including when ffmpeg
        cannot be started); the failure reason is printed.
    """
    # The commas inside min() are backslash-escaped so ffmpeg's filter parser
    # does not read them as filter separators.
    crop_filter = (
        "crop=min(iw\\,ih):min(iw\\,ih):(iw-min(iw\\,ih))/2:(ih-min(iw\\,ih))/2"
    )
    # An argument list (no shell) keeps file names with spaces or shell
    # metacharacters intact.
    cmd = ["ffmpeg", "-y", "-i", input_file, "-vf", crop_filter, output_file]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as e:
        print("Thumbnail cropping failed:", e)
        return False
    if result.returncode == 0:
        print("Thumbnail cropped to square successfully.")
        return True
    print("Thumbnail cropping failed:", result.stderr)
    return False

# Determine cover art from playlist by checking for a thumbnail that appears 3 times.
# If found, return its filename; otherwise, return the default cover filename.
def determine_cover_art(playlist_url, default_cover_filename):
    """Pick a cover image that is shared by at least three playlist entries.

    Each entry's thumbnail is downloaded to ``thumb_cover_<index>.jpg``. As soon
    as the same image content has been seen three times, the file of its first
    occurrence is returned.

    Args:
        playlist_url: URL passed to ``yt-dlp -J`` to list the playlist.
        default_cover_filename: Value returned when no thumbnail repeats three
            times or the playlist cannot be listed.

    Returns:
        The file name of the repeated thumbnail, or *default_cover_filename*.
    """
    print("Checking for duplicate thumbnails in playlist to determine cover art...")
    try:
        result = subprocess.run(
            ["yt-dlp", "-J", playlist_url], capture_output=True, text=True, check=True
        )
        playlist_info = json.loads(result.stdout)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        print("Error fetching playlist info for cover art determination:", e)
        return default_cover_filename

    # Tolerate a missing or null "entries" field.
    entries = playlist_info.get("entries") if isinstance(playlist_info, dict) else None
    entries = entries or []
    print(f"[Cover Art] Found {len(entries)} videos in the playlist for cover art determination.\n")

    # Image bytes -> {"file": first file with that content, "count": occurrences}.
    # Looking content up directly replaces one external `cmp` process per pair.
    references = {}

    for idx, entry in enumerate(entries, start=1):
        if entry is None:
            print(f"[Cover Art] Skipping entry {idx} because it's None")
            continue
        thumbnail_url = entry.get("thumbnail", "")
        if not thumbnail_url:
            print(f"[Cover Art] No thumbnail URL for entry {idx}, skipping.")
            continue
        thumb_filename = f"thumb_cover_{idx}.jpg"
        try:
            response = requests.get(thumbnail_url, timeout=30)
            if response.status_code != 200:
                print(f"[Cover Art] Failed to download thumbnail for entry {idx}.")
                continue
            content = response.content
            with open(thumb_filename, "wb") as f:
                f.write(content)
            print(f"[Cover Art] Downloaded thumbnail to {thumb_filename}")
        except (requests.RequestException, OSError) as e:
            print(f"[Cover Art] Error downloading thumbnail for entry {idx}: {e}")
            continue

        ref = references.get(content)
        if ref is None:
            references[content] = {"file": thumb_filename, "count": 1}
            print(f"[Cover Art] New thumbnail reference: {thumb_filename} count 1")
            continue

        ref["count"] += 1
        print(f"[Cover Art] Found duplicate for {ref['file']}. Count: {ref['count']}")
        if ref["count"] == 3:
            print("[Cover Art] Duplicate cover art found 3 times!")
            return ref["file"]

    print("[Cover Art] No duplicate thumbnail found 3 times, using default cover art.")
    return default_cover_filename

# Helpers for list_to_mp3. Every external tool is started from an argument list
# (no shell), so URLs, titles and artist names containing spaces, "&", "$" or
# quotes reach the tool unchanged.
def _quote_cmd(cmd):
    """Render an argument list as a copy-pasteable shell command (logging only)."""
    import shlex

    return " ".join(shlex.quote(str(arg)) for arg in cmd)


def _run_or_raise(cmd, failure_label, error_message):
    """Run *cmd*; on a non-zero exit print its output and raise Exception(error_message)."""
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"{failure_label}:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}")
        raise Exception(error_message)
    return result


def _download_tracks(url_list):
    """Download every URL as MP3 into the working directory; return the files in merge order."""
    total = len(url_list)
    for idx, url in enumerate(url_list, start=1):
        print(f"Downloading audio for URL {idx}/{total}: {url}")
        # The leading input position makes tracks sort in the order the URLs
        # were entered, also for single videos (whose playlist_index is "NA")
        # and when several playlists are given.
        template = f"{idx:03d}_%(playlist_index)s_%(title)s.%(ext)s"
        _run_or_raise(
            ["yt-dlp", "--restrict-filenames", "-x", "--audio-format", "mp3",
             "-o", template, url],
            f"Error downloading {url}",
            "yt-dlp command failed",
        )
        print(f"Successfully downloaded audio for URL {idx}/{total}")
    return sorted(f for f in os.listdir() if re.match(r"\d+_.*\.mp3$", f))


def _merge_tracks(track_files, output_name):
    """Concatenate *track_files* into "<output_name>.mp3" with ffmpeg's concat demuxer."""
    print("Creating file list for merging...")
    file_list_path = "file_list.txt"
    with open(file_list_path, "w", encoding="utf-8") as f:
        for file in track_files:
            # Absolute paths; a single quote is escaped the way the concat
            # file format expects ('\'').
            filepath = os.path.abspath(file).replace("'", "'\\''")
            f.write(f"file '{filepath}'\n")

    ffmpeg_cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", file_list_path,
                  "-c", "copy", f"{output_name}.mp3"]
    print("The ffmpeg merge command to be executed is:")
    print(_quote_cmd(ffmpeg_cmd))
    print("Merging downloaded audio files into a single MP3...")
    _run_or_raise(ffmpeg_cmd, "Error merging files", "ffmpeg merge command failed")
    print("Merge completed successfully.")


def _select_cover(thumbnail_url, url_list):
    """Download, choose and crop the cover image; return its file name or None."""
    default_cover = "cover.jpg"
    cover_filename = None
    if not thumbnail_url:
        print("No thumbnail URL found in first video. Proceeding without cover art.")
    elif download_thumbnail(thumbnail_url, default_cover):
        print("Default thumbnail downloaded successfully.")
        cover_filename = default_cover
    else:
        print("Default thumbnail download failed. Proceeding without cover art.")

    # If a playlist and we have a default cover, check for duplicate thumbnails
    if cover_filename and len(url_list) == 1 and "playlist?list=" in url_list[0]:
        determined_cover = determine_cover_art(url_list[0], cover_filename)
        if determined_cover != cover_filename:
            print(f"Using duplicate cover art from file: {determined_cover}")
            cover_filename = determined_cover
        else:
            print("No duplicate cover art found, keeping default cover art.")

    if cover_filename:
        cropped_cover = "cropped_" + cover_filename
        if crop_thumbnail(cover_filename, cropped_cover):
            cover_filename = cropped_cover
        else:
            print("Proceeding with the original thumbnail.")
    return cover_filename


def _embed_metadata(output_name, channel_name, cover_filename):
    """Write title/artist (and the cover, if any) into "<output_name>.mp3" in place."""
    merged_path = f"{output_name}.mp3"
    temp_path = f"temp_{output_name}.mp3"
    ffmpeg_meta_cmd = ["ffmpeg", "-y", "-i", merged_path]
    if cover_filename:
        ffmpeg_meta_cmd += ["-i", cover_filename, "-map", "0", "-map", "1"]
    ffmpeg_meta_cmd += [
        "-c", "copy", "-id3v2_version", "3",
        "-metadata", f"title={output_name}",
        "-metadata", f"artist={channel_name}",
    ]
    if cover_filename:
        ffmpeg_meta_cmd += [
            "-metadata:s:v", "title=Album cover",
            "-metadata:s:v", "comment=Cover (front)",
        ]
    ffmpeg_meta_cmd.append(temp_path)

    print("The metadata embedding ffmpeg command to be executed is:")
    print(_quote_cmd(ffmpeg_meta_cmd))
    _run_or_raise(
        ffmpeg_meta_cmd,
        "Error embedding metadata",
        "ffmpeg metadata embedding command failed",
    )
    os.replace(temp_path, merged_path)
    print("Metadata integrated successfully into the MP3 file.")


def _format_chapter_time(seconds):
    """Format *seconds* as HH:MM:SS.mmm; rounds to milliseconds first so SS never shows 60."""
    total_ms = int(round(seconds * 1000))
    hrs, remainder = divmod(total_ms, 3_600_000)
    mins, ms = divmod(remainder, 60_000)
    return f"{hrs:02d}:{mins:02d}:{ms / 1000:06.3f}"


def _create_opus(output_name, channel_name, cover_filename, track_files):
    """Encode "<output_name>.opus" with one chapter per track; return True on success."""
    wav_path = f"{output_name}.wav"
    try:
        opus_cmd = ["opusenc", "--quiet", "--bitrate", "128",
                    "--title", output_name, "--artist", channel_name]
        if cover_filename:
            opus_cmd += ["--picture", cover_filename]

        start = 0.0
        for idx, track in enumerate(track_files, start=1):
            duration = float(subprocess.check_output(
                ["ffprobe", "-v", "error", "-show_entries", "format=duration",
                 "-of", "default=noprint_wrappers=1:nokey=1", track],
                text=True,
            ).strip())
            # Track files are "<url index>_<playlist index>_<title>.mp3".
            chapter_title = os.path.splitext(track.split("_", 2)[-1])[0]
            opus_cmd += ["--comment", f"CHAPTER{idx:03d}={_format_chapter_time(start)}"]
            opus_cmd += ["--comment", f"CHAPTER{idx:03d}NAME={chapter_title}"]
            start += duration
        opus_cmd += [wav_path, f"{output_name}.opus"]

        # opusenc is fed an uncompressed WAV decoded from the merged MP3.
        subprocess.run(["ffmpeg", "-y", "-i", f"{output_name}.mp3", wav_path],
                       check=True, capture_output=True, text=True)
        print('Running opusenc to create OPUS file with chapters...')
        subprocess.run(opus_cmd, check=True, capture_output=True, text=True)
        print('OPUS file created successfully.')
        return True
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # The OPUS export is optional: report and carry on with the MP3.
        print('Failed to create OPUS file:', e)
        stderr = getattr(e, "stderr", None)
        if stderr:
            print(stderr)
        return False
    finally:
        # The WAV is only an intermediate; never leave it behind.
        if os.path.exists(wav_path):
            try:
                os.remove(wav_path)
            except OSError as e:
                print(f"Could not remove {wav_path}: {e}")


# Download audio, merge into a single MP3, and integrate metadata (with cropped cover art)
def list_to_mp3(urls):
    """Download all URLs, merge them into one tagged MP3 and export an OPUS copy.

    Steps: normalise the URLs, clear leftovers of earlier runs, download each
    URL as MP3, concatenate the tracks in input order, embed title/artist/cover
    art, then try to create an OPUS file with one chapter per track.

    Args:
        urls: Text with one YouTube URL per line; blank lines are ignored.

    Returns:
        A human-readable status line naming the files that were created.

    Raises:
        ValueError: If *urls* contains no URL.
        Exception: If downloading, merging or tagging fails.
    """
    print("Splitting input URLs...")
    url_list = [transform_playlist_url(url.strip()) for url in urls.splitlines() if url.strip()]
    if not url_list:
        raise ValueError("No URLs provided.")

    # Every click is a fresh run: without this, tracks and the merged file of a
    # previous run would be concatenated into the new output.
    clear_folder()

    # Determine output name: if a single playlist URL is provided, use its title
    output_name = ""
    if len(url_list) == 1 and "playlist?list=" in url_list[0]:
        output_name = get_playlist_name(url_list[0])
    output_name = output_name or "merged_audio"
    print("Output file will be:", output_name + ".mp3")

    track_files = _download_tracks(url_list)
    if not track_files:
        raise Exception("No audio files were downloaded")
    _merge_tracks(track_files, output_name)

    # Integrate metadata using ffmpeg
    print("Integrating metadata into the merged MP3...")
    video_metadata = get_video_metadata(url_list[0])
    channel_name = video_metadata.get("channel") or "Unknown Artist"
    topic_suffix = " - Topic"
    if channel_name.endswith(topic_suffix):
        channel_name = channel_name[:-len(topic_suffix)]
    cover_filename = _select_cover(video_metadata.get("thumbnail") or "", url_list)
    _embed_metadata(output_name, channel_name, cover_filename)

    # --- Convert the MP3 to OPUS and add chapters ---
    if _create_opus(output_name, channel_name, cover_filename, track_files):
        return f"✅ Merged audio saved as: {output_name}.mp3 and {output_name}.opus"
    return f"✅ Merged audio saved as: {output_name}.mp3 (OPUS export failed, see messages above)"

# Create ipywidgets for input
# Multi-line box for the URLs; its text is read by on_button_click.
text_area = widgets.Textarea(
    value='',
    placeholder='Enter YouTube URLs separated by new lines',
    description='URLs:',
    layout={'width': '100%', 'height': '200px'}
)
# Button that starts the download/merge run.
process_button = widgets.Button(
    description='Download & Merge',
    button_style='success'
)
# Output area that on_button_click prints progress and results into.
output_display = widgets.Output()

def on_button_click(b):
    """Run list_to_mp3 on the text area's content and show the outcome.

    Everything printed during the run goes to ``output_display``, which is
    cleared first. Any exception is reported there instead of propagating.

    Args:
        b: The clicked button; it is disabled for the duration of the run so
            that extra clicks cannot queue duplicate runs.
    """
    b.disabled = True
    try:
        with output_display:
            output_display.clear_output()
            print("Starting processing. Please wait...\n")
            try:
                result = list_to_mp3(text_area.value)
                print("\n" + result)
            except Exception as e:
                print("\nAn error occurred:", e)
    finally:
        b.disabled = False

# Register the click handler and render the input box, button and output area.
process_button.on_click(on_button_click)
display(text_area, process_button, output_display)