<a href="https://colab.research.google.com/github/roscoekerby/google-colab/blob/main/Dynamic_Podcasts_Episodes_Scraper_2025_04_17.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Imports

In [21]:
!pip install selenium
!pip install chromedriver-autoinstaller



In [22]:
!apt-get update
!apt-get install -y wget unzip
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
!apt-get -f install -y
!wget https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip
!unzip chromedriver_linux64.zip
!mv chromedriver /usr/local/bin/
!chmod +x /usr/local/bin/chromedriver

0% [Working]            Get:1 https://dl.google.com/linux/chrome/deb stable InRelease [1,825 B]
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (91.18                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Waiting for headers] [Waiting for header                                                                               Hit:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://dl.google.com/linux/chrome/deb stable/main amd64 Packages [1,216 B]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:1

In [33]:
!pip install webdriver-manager



# **Scrape Apple 2025-04-17**

In [24]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
import time
import json

def init_driver():
    """Create a headless Chrome WebDriver for scraping the Apple Podcasts page.

    Returns:
        A ``webdriver.Chrome`` instance started with the chromedriver binary
        resolved by webdriver-manager, the Chrome binary at
        ``/usr/bin/google-chrome`` and a 1920x1080 window.
    """
    chrome_options = webdriver.ChromeOptions()
    startup_flags = (
        '--ignore-certificate-errors',
        '--incognito',
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
    )
    for flag in startup_flags:
        chrome_options.add_argument(flag)
    chrome_options.binary_location = '/usr/bin/google-chrome'

    chromedriver_path = ChromeDriverManager().install()
    browser = webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)
    # Use a large window size.
    browser.set_window_size(1920, 1080)
    return browser

def wait_for_episodes(driver, current_count):
    """Wait up to 10 seconds for more episode items than ``current_count``.

    Args:
        driver: Selenium WebDriver showing the episode list.
        current_count: Number of ``li.svelte-8rlk6b`` items already present.

    Returns:
        True if the item count exceeded ``current_count`` before the wait
        timed out, False otherwise.
    """
    def more_items_present(drv):
        items = drv.find_elements(By.CSS_SELECTOR, "li.svelte-8rlk6b")
        return len(items) > current_count

    waiter = WebDriverWait(driver, 10)
    try:
        waiter.until(more_items_present)
    except TimeoutException:
        return False
    return True

def scroll_to_bottom(driver, max_attempts=10):
    """Scroll the episode list until no further episodes are loaded.

    Each round scrolls the last (up to five) loaded episode items into view,
    pauses, and then waits for the number of items to grow. Scrolling stops
    after ``max_attempts`` consecutive rounds in which the item count did not
    change. A round is counted at most once towards that limit.

    Args:
        driver: Selenium WebDriver positioned on the episodes page.
        max_attempts: Consecutive rounds without new episodes that are
            tolerated before giving up.
    """
    previous_count = 0
    idle_rounds = 0

    while idle_rounds < max_attempts:
        loaded_items = driver.find_elements(By.CSS_SELECTOR, "li.svelte-8rlk6b")
        loaded_count = len(loaded_items)

        # Items that appeared after the previous round's wait had already
        # timed out still count as progress.
        grew_since_last_round = loaded_count > previous_count

        # Scroll in small steps through the tail of the list.
        for item in loaded_items[-5:]:
            try:
                driver.execute_script("arguments[0].scrollIntoView(true);", item)
                time.sleep(0.5)
            except Exception:
                # Best effort: an item that cannot be scrolled to (e.g. it
                # went stale) is skipped. KeyboardInterrupt is not swallowed.
                continue

        time.sleep(2)

        grew_during_wait = wait_for_episodes(driver, loaded_count)

        if grew_since_last_round or grew_during_wait:
            idle_rounds = 0
        else:
            idle_rounds += 1

        previous_count = loaded_count
        print(f"Current episode count: {loaded_count}")

def _optional_text(element, selector, default):
    """Return the text of the first descendant matching ``selector``, or ``default`` if none exists."""
    try:
        return element.find_element(By.CSS_SELECTOR, selector).text
    except NoSuchElementException:
        return default


def extract_episode_info(episode, index):
    """Extract the fields of one Apple Podcasts episode element, with retries.

    Title and link are required; description, duration and publication date
    are optional and fall back to a default when their element is absent.
    Stale-element, missing-element and wait-timeout errors trigger a retry
    (up to five attempts, one second apart).

    Args:
        episode: Selenium WebElement for a single episode list item.
        index: Position of the episode in the scraped list; stored in the
            result under ``'index'``.

    Returns:
        A dict with keys ``index``, ``title``, ``link``, ``description``,
        ``duration`` and ``publish_date``, or None if every attempt failed.
    """
    max_retries = 5
    for attempt in range(max_retries):
        try:
            # The wait raises TimeoutException (not NoSuchElementException)
            # when the title never appears, so that is handled below too.
            title = WebDriverWait(episode, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "span.episode-details__title-text"))
            ).text

            link = episode.find_element(By.CSS_SELECTOR, "a.link-action").get_attribute('href')

            if not all([title, link]):
                raise NoSuchElementException("Missing title or link data")

            # A missing description is accepted: some episodes may not have one.
            description = _optional_text(
                episode, "div.episode-details__summary .multiline-clamp__text", ""
            )
            duration = _optional_text(episode, "div.progress-time", "Unknown")
            publish_date = _optional_text(episode, "p.episode-details__published-date", "Unknown")

            return {
                'index': index,
                'title': title,
                'link': link,
                'description': description,
                'duration': duration,
                'publish_date': publish_date
            }
        except (StaleElementReferenceException, NoSuchElementException, TimeoutException) as e:
            if attempt == max_retries - 1:
                print(f"Failed to extract episode at index {index} after {max_retries} attempts: {str(e)}")
            else:
                # Give the page a moment to settle before the next attempt.
                time.sleep(1)
    return None

def scrape_episodes(url):
    """Scrape every episode listed on an Apple Podcasts episodes page.

    Opens ``url`` in a fresh driver, waits for the first episode item, saves
    a debug screenshot to ``initial_load.png``, scrolls until the list stops
    growing, extracts each episode and writes the result to
    ``podcast_episodes.json``. Any exception raised along the way is printed
    and whatever was collected so far is returned. The driver is always
    closed.

    Args:
        url: Address of the episodes page to scrape.

    Returns:
        List of episode dicts as produced by ``extract_episode_info``.
    """
    scraped = []
    browser = init_driver()

    try:
        browser.get(url)

        # Block until at least one episode item is in the DOM.
        first_item_present = EC.presence_of_element_located((By.CSS_SELECTOR, "li.svelte-8rlk6b"))
        WebDriverWait(browser, 15).until(first_item_present)

        # Debug aid: snapshot of the page right after the initial load.
        browser.save_screenshot("initial_load.png")

        scroll_to_bottom(browser)

        items = browser.find_elements(By.CSS_SELECTOR, "li.svelte-8rlk6b")
        total = len(items)
        print(f"Total episodes found: {total}")

        for position, item in enumerate(items):
            info = extract_episode_info(item, position)
            if not info:
                print(f"Failed to scrape episode at index {position}")
                continue
            scraped.append(info)
            print(f"Scraped ({position + 1}/{total}): {info['title']}")

        # Keep the page order.
        scraped.sort(key=lambda entry: entry['index'])

        # Warn about episode numbers 1-39 that no scraped title mentions.
        scraped_titles = [entry['title'] for entry in scraped]
        for number in range(1, 40):  # Adjust range based on expected episode count
            label = f"TMGP Ep {str(number).zfill(2)}"
            if all(label not in scraped_title for scraped_title in scraped_titles):
                print(f"WARNING: Potentially missing {label}")

        with open("podcast_episodes.json", "w", encoding="utf-8") as out_file:
            json.dump(scraped, out_file, ensure_ascii=False, indent=4)

        print("Successfully saved data to podcast_episodes.json")

    except Exception as e:
        print(f"An error occurred: {e}")

    finally:
        browser.quit()

    return scraped

if __name__ == "__main__":
    # Scrape the Apple Podcasts listing and keep the result for later cells.
    url = "https://podcasts.apple.com/za/podcast/the-muscle-growth-podcast/id1717906577/episodes"
    apple_episodes = scrape_episodes(url)
    print(f"\nTotal episodes successfully scraped: {len(apple_episodes)}")

    # Dump every scraped episode, framed by a row of asterisks.
    print("\nAll scraped episodes:")
    divider = '*' * 80
    for ep in apple_episodes:
        lines = [
            divider,
            f"Title: {ep['title']}",
            f"Link: {ep['link']}",
            f"Duration: {ep['duration']}",
            f"Published: {ep['publish_date']}",
            # Only the first 150 characters of the description are shown.
            f"Description: {ep['description'][:150]}...",
            divider,
        ]
        print("\n".join(lines))

Current episode count: 25
Current episode count: 38
Current episode count: 38
Current episode count: 38
Current episode count: 38
Current episode count: 38
Current episode count: 38
Total episodes found: 38
Scraped (1/38): TMGP Ep 38 with NO-BS strength and fitness coach Cillian O’Connor AKA DYSFUNCTIONALPATTERNS
Scraped (2/38): TMGP Ep 37 with men’s physique winner, bodybuilder, and lifestyle coach - Jonathan Kantor
Scraped (3/38): TMGP Ep 36 with Emanuel Pescari: Austria’s Strongest Man, MULTIPLE WORLD RECORD TITLE HOLDER & INTERNATIONAL STRONGMAN SUPERSTAR Competitor part 2
Scraped (4/38): TMGP Ep 35 with Emanuel Pescari: Austria’s Strongest Man and Elite Strongman Competitor part 1
Scraped (5/38): TMGP Ep 34 with health coach, wellness consultant mindset expert Jeffrey Siegel part 2
Scraped (6/38): TMGP Ep 33 with health coach, wellness consultant, and mindset expert Jeffrey Siegel
Scraped (7/38): TMGP Ep 32 with health coach and diabetes expert Ghamdan Al-Areeky
Scraped (8/38): TM

# **Scrape Spotify**

In [25]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
import time
import json
import os
import re
from datetime import datetime

def init_driver():
    """Initialize and return a web driver instance.

    The driver is a headless Chrome with a desktop user agent, the
    automation switches disabled and ``navigator.webdriver`` overridden to
    ``undefined`` on every new document.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

    chrome_options = webdriver.ChromeOptions()
    startup_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--window-size=1920,1080',
        '--incognito',
        '--disable-blink-features=AutomationControlled',
        '--user-agent=' + user_agent,
    )
    for flag in startup_flags:
        chrome_options.add_argument(flag)

    # Disable webdriver mode to avoid detection
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    chromedriver_path = ChromeDriverManager().install()
    browser = webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)

    # Execute CDP commands to prevent detection
    browser.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": user_agent})
    hide_webdriver_script = """
        Object.defineProperty(navigator, 'webdriver', {
          get: () => undefined
        })
      """
    browser.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": hide_webdriver_script},
    )

    return browser

def click_load_more(driver, max_attempts=15):
    """Click the 'Load more episodes' button until it disappears.

    Before every click the button is scrolled into view and a screenshot is
    saved to ``before_click_<attempt>.png``. Errors during an attempt are
    printed and consume that attempt.

    Args:
        driver: Selenium WebDriver showing the Spotify show page.
        max_attempts: Maximum number of click attempts.

    Returns:
        True if the button was no longer present (nothing more to load),
        False if ``max_attempts`` was used up while the button could still
        be there.
    """
    for attempt in range(max_attempts):
        try:
            # Look specifically for the "Load more episodes" button at the bottom
            buttons = driver.find_elements(By.XPATH, "//button[contains(., 'Load more episodes')]")

            if not buttons:
                print("No 'Load more episodes' button found, all episodes may be loaded")
                return True

            button = buttons[0]
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button)
            time.sleep(1)

            # Debug aid: screenshot before clicking
            driver.save_screenshot(f"before_click_{attempt}.png")

            # Click through JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", button)
            print(f"Clicked 'Load more episodes' button (attempt {attempt+1})")

            # Wait for new content to load
            time.sleep(3)

        except Exception as e:
            print(f"Error clicking 'Load more episodes' button: {e}")
            time.sleep(2)

    return False

def _first_matching_text(container, selectors, default):
    """Return the stripped text of the first element matched by any of ``selectors``, else ``default``."""
    for selector in selectors:
        try:
            return container.find_element(By.CSS_SELECTOR, selector).text.strip()
        except Exception:
            # Best effort: try the next selector. Unlike a bare ``except``,
            # this lets KeyboardInterrupt/SystemExit propagate.
            continue
    return default


def extract_episodes(driver):
    """Extract all podcast episodes from the currently loaded Spotify show page.

    Loads the full episode list via ``click_load_more``, writes the page
    source to ``spotify_page_source.html`` and a screenshot to
    ``after_loading_episodes.png`` for debugging, then reads every episode
    container.

    Args:
        driver: Selenium WebDriver showing the Spotify show page.

    Returns:
        List of dicts, each with the keys ``title``, ``url``,
        ``description``, ``published_date``, ``duration``, ``explicit`` and
        ``episode_number``. Fields that cannot be read fall back to a
        placeholder (``"Title not found"``, ``"Description not found"``,
        ``""`` or ``False``). Errors are printed, never raised.
    """
    spotify_episodes = []

    try:
        # Click the load more button until all episodes are loaded
        click_load_more(driver)

        # Save the page source for debugging
        with open("spotify_page_source.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)

        # Take a screenshot after loading episodes
        driver.save_screenshot("after_loading_episodes.png")

        # Find all episode containers
        episode_containers = driver.find_elements(By.CSS_SELECTOR, "div.T9iBYqbERZHdwDl0U2tC")
        print(f"Found {len(episode_containers)} episode containers")

        for i, container in enumerate(episode_containers):
            try:
                title = _first_matching_text(
                    container,
                    (
                        "span.episode-details__title-text, span.ListRowTitle__LineClamp-sc-1xe2if1-0",
                        "a[data-encore-id='listRowTitle']",
                    ),
                    "Title not found",
                )

                try:
                    link_elem = container.find_element(By.CSS_SELECTOR, "a.link-action, a[data-encore-id='listRowTitle']")
                    url = link_elem.get_attribute('href')
                except Exception:
                    url = ""

                description = _first_matching_text(
                    container,
                    (
                        "span.ListRowDetails__LineClamp-sc-sozu4l-1",
                        "div.episode-details__summary, p[data-encore-id='listRowDetails']",
                    ),
                    "Description not found",
                )

                published_date = _first_matching_text(
                    container,
                    ("p.episode-details__published-date, p._q93agegdE655O5zPz6l",),
                    "",
                )

                duration = _first_matching_text(
                    container,
                    ("div.progress-time, span.UyzJidwrGk3awngSGIwv",),
                    "",
                )

                # The explicit flag is simply the presence of the marker element.
                try:
                    container.find_element(By.CSS_SELECTOR, "span[aria-label='Explicit'], span.SgFtsvn3upY_tG6mnt4n")
                    explicit = True
                except Exception:
                    explicit = False

                # Always emit 'episode_number' so every record has the same keys.
                episode_number = ""
                if title != "Title not found":
                    ep_match = re.search(r'(?i)ep(?:isode)?\s*(\d+)', title)
                    if ep_match:
                        episode_number = ep_match.group(1)

                episode_data = {
                    'title': title,
                    'url': url,
                    'description': description,
                    'published_date': published_date,
                    'duration': duration,
                    'explicit': explicit,
                    'episode_number': episode_number,
                }

                spotify_episodes.append(episode_data)
                print(f"Extracted episode {i+1}: {episode_data['title']}")

            except Exception as e:
                print(f"Error extracting data for episode {i+1}: {e}")

    except Exception as e:
        print(f"Error in extract_episodes: {e}")

    return spotify_episodes

def save_to_json(data, filename="spotify_podcast_episodes.json"):
    """Write ``data`` to ``filename`` as UTF-8 JSON indented by four spaces.

    Args:
        data: JSON-serializable object to store.
        filename: Destination path.

    Returns:
        True when the file was written, False if any exception occurred (the
        error is printed rather than raised).
    """
    succeeded = False
    try:
        with open(filename, "w", encoding="utf-8") as out_file:
            json.dump(data, out_file, ensure_ascii=False, indent=4)
        print(f"Data successfully saved to {filename}")
        succeeded = True
    except Exception as e:
        print(f"Error saving data to file: {e}")
    return succeeded

def scrape_spotify_podcast(url):
    """Scrape the episodes of a Spotify show and save them to a timestamped JSON file.

    Opens ``url``, sleeps ten seconds for the page to render, saves a debug
    screenshot to ``spotify_page_initial.png``, extracts the episodes, writes
    them to ``spotify_podcast_episodes_<YYYYmmdd_HHMMSS>.json`` and prints a
    summary containing the first and last episode.

    Args:
        url: Address of the Spotify show page.

    Returns:
        List of episode dicts, or an empty list if an exception occurred.
        The browser is closed in either case.
    """
    def show_details(heading, episode):
        # Print one episode; descriptions longer than 100 characters are cut.
        print(heading)
        for field, content in episode.items():
            if field == 'description' and len(content) > 100:
                print(f"{field}: {content[:100]}...")
            else:
                print(f"{field}: {content}")

    driver = None
    try:
        print(f"Starting to scrape podcast episodes from: {url}")
        driver = init_driver()

        driver.get(url)
        print("Page loaded successfully")

        # Fixed pause to let the page finish loading.
        time.sleep(10)

        # Debug aid: screenshot of the initial page.
        driver.save_screenshot("spotify_page_initial.png")

        spotify_episodes = extract_episodes(driver)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        save_to_json(spotify_episodes, f"spotify_podcast_episodes_{timestamp}.json")

        print(f"\nTotal episodes scraped: {len(spotify_episodes)}")
        if spotify_episodes:
            show_details("\nFirst episode details:", spotify_episodes[0])
            show_details("\nLast episode details:", spotify_episodes[-1])

        return spotify_episodes

    except Exception as e:
        print(f"An error occurred: {e}")
        return []

    finally:
        if driver:
            driver.quit()
            print("Browser closed")

if __name__ == "__main__":
    # Show page of the podcast to scrape; the result is kept for later cells.
    podcast_url = "https://open.spotify.com/show/0TBy1wFQocjsf4MGqPVNKV"
    spotify_episodes = scrape_spotify_podcast(podcast_url)

    if not spotify_episodes:
        print("Failed to scrape episodes.")
    else:
        print(f"\nSuccessfully scraped {len(spotify_episodes)} episodes.")

Starting to scrape podcast episodes from: https://open.spotify.com/show/0TBy1wFQocjsf4MGqPVNKV
Page loaded successfully
Clicked 'Load more episodes' button (attempt 1)
Clicked 'Load more episodes' button (attempt 2)
Clicked 'Load more episodes' button (attempt 3)
Clicked 'Load more episodes' button (attempt 4)
Clicked 'Load more episodes' button (attempt 5)
Clicked 'Load more episodes' button (attempt 6)
No 'Load more episodes' button found, all episodes may be loaded
Found 38 episode containers
Extracted episode 1: TMGP Ep 38 with NO-BS strength and fitness coach Cillian O’Connor AKA DYSFUNCTIONALPATTERNS
Extracted episode 2: TMGP Ep 37 with men’s physique winner, bodybuilder, and lifestyle coach - Jonathan Kantor
Extracted episode 3: TMGP Ep 36 with Emanuel Pescari: Austria’s Strongest Man, MULTIPLE WORLD RECORD TITLE HOLDER & INTERNATIONAL STRONGMAN SUPERSTAR Competitor part 2
Extracted episode 4: TMGP Ep 35 with Emanuel Pescari: Austria’s Strongest Man and Elite Strongman Competito

In [35]:
apple_episodes

[{'index': 0,
  'title': 'TMGP Ep 38 with NO-BS strength and fitness coach Cillian O’Connor AKA DYSFUNCTIONALPATTERNS',
  'link': 'https://podcasts.apple.com/za/podcast/tmgp-ep-38-with-no-bs-strength-and-fitness-coach/id1717906577?i=1000703079821',
  'description': "Cillian O'Connor, known by his Instagram handle @dysfunctionalpatterns, is a coach who combines critical analysis with sharp satire to examine popular fitness methodologies — most notably Functional Patterns. His content highlights and parodies what he sees as the absurdities, inconsistencies, and questionable practices within these systems, encouraging a more skeptical and evidence-based approach to training. Beyond the satire, Cillian focuses on helping men get stronger, build muscle, and manage pain with no-BS, practical training methods. In this episode, we dive into the importance of critical thinking in the fitness space, the risks of blindly following trending methodologies, and why grounding your training in science

In [36]:
spotify_episodes

[{'title': 'TMGP Ep 38 with NO-BS strength and fitness coach Cillian O’Connor AKA DYSFUNCTIONALPATTERNS',
  'url': 'https://open.spotify.com/episode/7iNTGVONthOQBlkIMpJuma',
  'description': 'The MUSCLE GROWTH Podcast',
  'published_date': 'Apr 10',
  'duration': '1 hr 24 min',
  'explicit': True,
  'episode_number': '38'},
 {'title': 'TMGP Ep 37 with men’s physique winner, bodybuilder, and lifestyle coach - Jonathan Kantor',
  'url': 'https://open.spotify.com/episode/2zEhXqqhndisCf7dhvT9wf',
  'description': 'The MUSCLE GROWTH Podcast',
  'published_date': 'Mar 27',
  'duration': '1 hr 34 min',
  'explicit': True,
  'episode_number': '37'},
 {'title': 'TMGP Ep 36 with Emanuel Pescari: Austria’s Strongest Man, MULTIPLE WORLD RECORD TITLE HOLDER & INTERNATIONAL STRONGMAN SUPERSTAR Competitor part 2',
  'url': 'https://open.spotify.com/episode/5cVBijng6H9VTnVIatyMXs',
  'description': 'The MUSCLE GROWTH Podcast',
  'published_date': 'Mar 13',
  'duration': '1 hr 4 min',
  'explicit': Fal

In [37]:
spotify_episodes[0]['url']

'https://open.spotify.com/episode/7iNTGVONthOQBlkIMpJuma'

In [39]:
# Collect the Spotify URL of every scraped episode that has one, then list them.
spotify_episode_urls = [ep["url"] for ep in spotify_episodes if "url" in ep]
for url in spotify_episode_urls:
    print(url)

https://open.spotify.com/episode/7iNTGVONthOQBlkIMpJuma
https://open.spotify.com/episode/2zEhXqqhndisCf7dhvT9wf
https://open.spotify.com/episode/5cVBijng6H9VTnVIatyMXs
https://open.spotify.com/episode/3VfywzPhiwlkmnbJOfyfQL
https://open.spotify.com/episode/5ygZIZd7fD2gVNByDzEmVv
https://open.spotify.com/episode/6yu3QR20x44rqqLbUWDTIO
https://open.spotify.com/episode/2MNn8Dhv0xUdFdn76plSwG
https://open.spotify.com/episode/1eo2qrTosIrxgp7NQ5lmfZ
https://open.spotify.com/episode/1wE5BgzIfFYXhn49UKHIgF
https://open.spotify.com/episode/51sHHFWSjSgAXSEBPLdOQi
https://open.spotify.com/episode/4AoZQaJ3l23CctCgCDKtFp
https://open.spotify.com/episode/5D0zyD9TfHsyWU8OaYhg7b
https://open.spotify.com/episode/2G9YKjM1733RO7tMJJrMMf
https://open.spotify.com/episode/2ufYLgS9o9OMhF5Bb4ChQX
https://open.spotify.com/episode/5sE1XbZHuRINseqmTx7Uqz
https://open.spotify.com/episode/0G735vBGDMmXi6Su2GoFFu
https://open.spotify.com/episode/6tPwmGJ4WKh04U82hCtGM0
https://open.spotify.com/episode/6G77vKu5HMVjnxm

In [41]:
# Collect the Apple Podcasts link of every scraped episode that has one, then list them.
apple_episode_urls = [ep["link"] for ep in apple_episodes if "link" in ep]
for link in apple_episode_urls:
    print(link)

https://podcasts.apple.com/za/podcast/tmgp-ep-38-with-no-bs-strength-and-fitness-coach/id1717906577?i=1000703079821
https://podcasts.apple.com/za/podcast/tmgp-ep-37-with-mens-physique-winner-bodybuilder/id1717906577?i=1000701153703
https://podcasts.apple.com/za/podcast/tmgp-ep-36-with-emanuel-pescari-austrias-strongest-man/id1717906577?i=1000698999938
https://podcasts.apple.com/za/podcast/tmgp-ep-35-with-emanuel-pescari-austrias-strongest-man/id1717906577?i=1000696511998
https://podcasts.apple.com/za/podcast/tmgp-ep-34-with-health-coach-wellness-consultant/id1717906577?i=1000691768471
https://podcasts.apple.com/za/podcast/tmgp-ep-33-with-health-coach-wellness-consultant/id1717906577?i=1000687078251
https://podcasts.apple.com/za/podcast/tmgp-ep-32-with-health-coach-and-diabetes-expert/id1717906577?i=1000684288827
https://podcasts.apple.com/za/podcast/tmgp-ep-31-with-the-world-renowned-muscle-researcher/id1717906577?i=1000682391409
https://podcasts.apple.com/za/podcast/tmgp-ep-30-with-th

# Concatenate Info from Apple and Spotify

In [43]:
def create_episode_info(apple_episodes, spotify_episode_urls):
    """Combine Apple episode records with Spotify URLs, pairing them by position.

    Args:
        apple_episodes: Sequence of dicts with the keys ``title``,
            ``description`` and ``link``.
        spotify_episode_urls: Sequence of Spotify episode URLs in the same
            order as ``apple_episodes``.

    Returns:
        A list of dicts with ``title``, ``description`` and a ``urls`` dict
        holding the ``spotify`` and ``apple`` links. If the two inputs differ
        in length a warning is printed and an empty list is returned.
    """
    # Positional pairing is only meaningful when both sides have equal length.
    if len(apple_episodes) != len(spotify_episode_urls):
        print("Warning: Mismatched number of Apple episodes and Spotify URLs.")
        return []

    return [
        {
            "title": apple_entry["title"],
            "description": apple_entry["description"],
            "urls": {
                "spotify": spotify_url,
                "apple": apple_entry["link"],
            },
        }
        for apple_entry, spotify_url in zip(apple_episodes, spotify_episode_urls)
    ]

# Merge the Apple episode records with the Spotify URLs (paired by list position).
episodes = create_episode_info(apple_episodes, spotify_episode_urls)


# Format into HTML for WEBSITE

In [44]:
def print_formatted_episodes_html(episodes):
    """Print a standalone HTML page with one card per episode.

    Titles, descriptions and URLs are HTML-escaped before being inserted,
    so characters such as ``&``, ``<`` or ``"`` in them cannot break the
    markup.

    Args:
        episodes: List of dicts with ``title``, ``description`` and a
            ``urls`` dict containing ``apple`` and ``spotify`` links.
            The list is rendered back to front and the cards are numbered
            from 1, so with a newest-first input "Episode 1" is the oldest
            episode.
    """
    from html import escape

    parts = ["""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Episodes</title>
        <style>
            body {
                font-family: Arial, sans-serif;
            }
            .episode-card {
                border: 1px solid #ddd;
                padding: 20px;
                margin: 10px 0;
                border-radius: 8px;
                box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
            }
            .episode-title {
                font-size: 1.5em;
                font-weight: bold;
                margin-bottom: 10px;
            }
            .episode-subtitle {
                font-size: 1.2em;
                font-weight: bold;
                color: #333;
                margin: 5px 0;
            }
            .episode-description {
                font-size: 1em;
                line-height: 1.6;
                margin: 10px 0;
            }
            .platform-link {
                font-weight: bold;
                text-decoration: none;
                margin-right: 10px;
            }
            .apple-link {
                color: #0073e6;
            }
            .spotify-link {
                color: #1DB954;
            }
        </style>
    </head>
    <body>
    <div id="episodes-container">
    """]

    # Generate one card per episode, walking the input list in reverse.
    for i, episode in enumerate(reversed(episodes), 1):
        # Text nodes only need &, < and > escaped; attribute values also
        # need their quotes escaped.
        title = escape(episode["title"], quote=False)
        description = escape(episode["description"], quote=False)
        apple_url = escape(episode["urls"]["apple"], quote=True)
        spotify_url = escape(episode["urls"]["spotify"], quote=True)

        parts.append(f"""
        <div class="episode-card">
            <div class="episode-title">Episode {i}</div>
            <div class="episode-subtitle">{title}</div>
            <div class="episode-description">{description}</div>
            <p>
                <a href="{apple_url}" target="_blank" class="platform-link apple-link">
                    🍏 Listen on Apple
                </a>
                <a href="{spotify_url}" target="_blank" class="platform-link spotify-link">
                    🎶 Listen on Spotify
                </a>
            </p>
        </div>
        """)

    # Close the main container and body tags
    parts.append("""
    </div>
    </body>
    </html>
    """)

    print("".join(parts))

# Example usage: rebuild the merged episode list from the scraped data and
# print the complete HTML page to the cell output.
episodes = create_episode_info(apple_episodes, spotify_episode_urls)
print_formatted_episodes_html(episodes)



    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Episodes</title>
        <style>
            body {
                font-family: Arial, sans-serif;
            }
            .episode-card {
                border: 1px solid #ddd;
                padding: 20px;
                margin: 10px 0;
                border-radius: 8px;
                box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
            }
            .episode-title {
                font-size: 1.5em;
                font-weight: bold;
                margin-bottom: 10px;
            }
            .episode-subtitle {
                font-size: 1.2em;
                font-weight: bold;
                color: #333;
                margin: 5px 0;
            }
            .episode-description {
                font-size: 1em;
                line-height: 1.6;
                margin: 10px 0;
            