# In [None]:
import os
import requests
from bs4 import BeautifulSoup

def extract_archive_urls(html):
    """Return the href of every anchor in *html* that points at an archive.org download.

    Args:
        html: HTML markup as a string; parsed with BeautifulSoup's
            ``html.parser`` backend.

    Returns:
        A list of href strings, in document order, for each ``<a>`` tag whose
        ``href`` attribute contains ``archive.org/download``.
    """
    marker = 'archive.org/download'
    document = BeautifulSoup(html, 'html.parser')
    # href=True restricts the search to anchors that actually carry an href.
    hrefs = (anchor['href'] for anchor in document.find_all('a', href=True))
    return [link for link in hrefs if marker in link]

def download_file(url, dest_folder, file_name, timeout=30):
    """Download *url* to ``dest_folder/file_name`` unless that file already exists.

    The response body is streamed into a temporary ``<file_name>.part`` file
    and renamed into place only after it has been written completely, so an
    interrupted transfer never leaves a truncated file that a later run would
    skip as "already exists".

    Args:
        url: Address to fetch with an HTTP GET.
        dest_folder: Existing directory that receives the file.
        file_name: Name of the file to create inside ``dest_folder``.
        timeout: Seconds handed to ``requests.get`` as its timeout so a
            stalled server cannot block the run indefinitely.

    Returns:
        None. Success, skip and failure are reported with ``print``.

    Raises:
        OSError: If the destination cannot be written or renamed.
    """
    file_path = os.path.join(dest_folder, file_name)
    if os.path.exists(file_path):
        print(f"{file_name} already exists. Skipping download.")
        return

    partial_path = file_path + ".part"
    try:
        # The context manager releases the streamed connection on every path,
        # including the non-200 one.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}")
                return
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(65536):
                    f.write(chunk)
        # Atomic on the same filesystem: the final name appears only when complete.
        os.replace(partial_path, file_path)
    except requests.RequestException:
        # Network-level failures are reported like a bad status instead of
        # aborting the remaining downloads.
        print(f"Failed to download {url}")
        return
    finally:
        # After a successful rename the partial file no longer exists; on any
        # failure this removes the incomplete data.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded {file_name}")

def process_directories(base_path):
    """Download the archive links listed in each sub-directory of *base_path*.

    A sub-directory is processed only when it contains a ``urls.html`` entry
    and an ``mp3parts`` directory. The links extracted from ``urls.html`` are
    downloaded into ``mp3parts`` as ``001.mp3``, ``002.mp3``, ... following
    their order in the page.

    Args:
        base_path: Directory whose immediate children are scanned.

    Returns:
        None.
    """
    for entry in os.listdir(base_path):
        folder = os.path.join(base_path, entry)
        if not os.path.isdir(folder):
            continue

        link_page = os.path.join(folder, 'urls.html')
        target_dir = os.path.join(folder, 'mp3parts')
        if not (os.path.exists(link_page) and os.path.isdir(target_dir)):
            continue

        with open(link_page, 'r', encoding='utf-8') as handle:
            page = handle.read()

        # Numbering starts at 1 and is zero-padded to three digits.
        for position, link in enumerate(extract_archive_urls(page), start=1):
            download_file(link, target_dir, f"{position:03d}.mp3")

# Root directory to scan, given relative to the current working directory.
base_path = "./raw_data"

# Runs as soon as this cell/module executes: every sub-directory of base_path
# that has both a urls.html file and an mp3parts folder gets the links from
# urls.html downloaded into mp3parts.
process_directories(base_path)
