<a href="https://colab.research.google.com/github/roscoekerby/python-projects/blob/main/Dynamic_Podcasts_Episodes_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Imports

In [17]:
!pip install selenium
!pip install chromedriver-autoinstaller



# **Scrape Apple**

In [44]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time

episode_titles = []
episode_descriptions = []
episode_apple_urls = []

def init_driver():
    """Build the headless Chrome WebDriver used by the Apple scraping cell.

    The Chrome binary is pinned to ``/usr/bin/google-chrome``.

    Returns:
        A ``webdriver.Chrome`` instance created with the options below.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_flags = (
        '--ignore-certificate-errors',
        '--incognito',
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
    )
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    chrome_options.binary_location = '/usr/bin/google-chrome'
    return webdriver.Chrome(options=chrome_options)

def scroll_to_bottom(driver, pause_time=2, max_scrolls=None):
    """Scroll the page down repeatedly until its height stops growing.

    After every scroll the function sleeps for ``pause_time`` seconds and
    then re-reads ``document.body.scrollHeight``; it stops as soon as two
    consecutive readings are equal.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium
            WebDriver in this notebook).
        pause_time: Seconds to wait after each scroll before measuring the
            page height again. Defaults to 2, the previously hard-coded value.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps the original behaviour of scrolling until
            the height stops changing; a number guards against pages that
            never stop growing.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0

    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1
        time.sleep(pause_time)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged after a scroll: nothing more was loaded.
            break
        last_height = new_height

def scrape_episodes(url):
    """Scrape episode titles, links and descriptions from an episodes page.

    Every successfully parsed episode is appended to the module-level lists
    ``episode_titles``, ``episode_apple_urls`` and ``episode_descriptions``
    (so repeated calls accumulate), and those same lists are returned.

    Args:
        url: Address of the page to open in the browser.

    Returns:
        The tuple ``(episode_titles, episode_apple_urls,
        episode_descriptions)``.

    Raises:
        Whatever ``init_driver()`` or ``driver.get(url)`` raise; errors
        during scrolling and extraction are printed and not re-raised.
    """
    driver = init_driver()

    try:
        # Navigation happens under the ``finally`` so that a failed page load
        # still shuts the browser down instead of leaking the Chrome process.
        driver.get(url)

        try:
            # Scroll to load all episodes
            scroll_to_bottom(driver)

            # Find all episode elements.
            # NOTE(review): "svelte-8rlk6b" looks like a build-generated class
            # name and may change between site releases -- confirm.
            episode_elements = driver.find_elements(By.CSS_SELECTOR, "li.svelte-8rlk6b")

            for episode in episode_elements:
                try:
                    # Read all three fields before appending anything so the
                    # three result lists always stay the same length.
                    title = episode.find_element(By.CSS_SELECTOR, "span.episode-details__title-text").text
                    link = episode.find_element(By.CSS_SELECTOR, "a.link-action").get_attribute('href')
                    description = episode.find_element(By.CSS_SELECTOR, "p.episode-details__summary").text

                    episode_titles.append(title)
                    episode_apple_urls.append(link)
                    episode_descriptions.append(description)

                    print("Title:", title)
                    print("URL:", link)
                    print("Description:", description)
                    print("----------")

                except NoSuchElementException:
                    print("Failed to extract information for an episode")

        except Exception as e:
            print(f"An error occurred: {e}")

    finally:
        driver.quit()

    return episode_titles, episode_apple_urls, episode_descriptions

# Example usage: scrape one show's episodes page and report how many
# episodes were collected.
if __name__ == "__main__":
    url = "https://podcasts.apple.com/za/podcast/the-muscle-growth-podcast/id1717906577/episodes"
    # scrape_episodes returns (titles, apple urls, descriptions), in that order.
    titles, urls, descriptions = scrape_episodes(url)
    print(f"Total episodes scraped: {len(titles)}")

Title: TMGP Ep 22 with Injury Coach, Chiropractor and Powerlifter Dr J’aime Goguen-Locke
URL: https://podcasts.apple.com/za/podcast/tmgp-ep-22-with-injury-coach-chiropractor-and/id1717906577?i=1000667179540
Description: Dr. J'aime Goguen is a dedicated chiropractor and injury coach with over 18 years of experience helping individuals recover from musculoskeletal injuries. She earned her Doctor of Chiropractic degree from Parker University and takes a holistic approach to treatment, incorporating techniques such as deep tissue myofascial release, active rehabilitation, and nutritional guidance to support her patients' recovery. As an injury coach, specializing in powerlifting injury management and prevention, Dr. J'aime emphasizes preventative care and works collaboratively with other healthcare professionals to provide comprehensive, tailored care for her patients. She is committed to patient education and empowerment, setting her apart as a leader in the field of chiropractic care. In

# **Scrape Spotify**



In [45]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
import time

def init_driver():
    """Initialize the headless Chrome WebDriver used by the Spotify cell.

    Returns:
        A ``webdriver.Chrome`` instance created with the options below.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in (
        '--ignore-certificate-errors',
        '--incognito',
        '--headless',               # Run Chrome in headless mode
        '--no-sandbox',             # Bypass OS security model
        '--disable-dev-shm-usage',  # Overcome limited resource problems
    ):
        chrome_options.add_argument(flag)
    # The options object is handed to the constructor.
    return webdriver.Chrome(options=chrome_options)

def scroll_to_top(driver, pause_time=2):
    """Scroll the page back to the top and wait briefly.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium
            WebDriver in this notebook).
        pause_time: Seconds to sleep after issuing the scroll, to allow the
            scroll action to complete. Defaults to 2, the previously
            hard-coded value.
    """
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(pause_time)

def click_element(driver, selector):
    """Wait up to 10 seconds for a CSS-selected element and click it.

    Args:
        driver: WebDriver passed to ``WebDriverWait``.
        selector: CSS selector of the element to click.

    Returns:
        ``True`` if the element became clickable and the click succeeded;
        ``False`` if any exception was raised (the error is printed).
    """
    try:
        waiter = WebDriverWait(driver, 10)
        target = waiter.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
        )
        target.click()
    except Exception as e:
        print(f"Element not found or not clickable: {e}")
        return False
    else:
        return True

# Example usage: open the show page, click the chip matched by `selector`
# up to three times, then collect the links found in the infinite-scroll list.
if __name__ == "__main__":
    url = "https://open.spotify.com/show/0TBy1wFQocjsf4MGqPVNKV"

    # Bound before any browser work so later cells never see an undefined
    # (or stale, from a previous run) list when this cell fails part-way.
    episode_and_show_list = []

    driver = init_driver()

    try:
        # Navigation is inside the try so a failed page load still reaches
        # driver.quit() below instead of leaking the headless browser.
        driver.get(url)

        # Step 3: Scroll to the top of the page
        scroll_to_top(driver)

        # Step 4: Scroll to the top of the page again (if necessary)
        scroll_to_top(driver)

        # Step 5-7: Click on elements using the specified CSS selector
        selector = '.LegacyChipInner__ChipInnerComponent-sc-1qguixk-0 > .encore-text'

        # Click the element repeatedly, stopping at the first failed click or
        # after max_clicks successful clicks, whichever comes first.
        click_attempts = 0
        max_clicks = 3  # Maximum number of clicks to attempt
        while click_attempts < max_clicks:
            if click_element(driver, selector):
                print(f"Click {click_attempts + 1} successful")
                click_attempts += 1
                time.sleep(5)  # Delay to ensure content loads after each click
            else:
                break

        # Find the element with data-testid="infinite-scroll-list"
        infinite_scroll_list = driver.find_element(By.CSS_SELECTOR, '[data-testid="infinite-scroll-list"]')

        # Within the infinite scroll list, find all anchor tags
        episode_links = infinite_scroll_list.find_elements(By.TAG_NAME, 'a')

        # Keep every non-empty href that does not contain the substring 'show'
        for episode_link in episode_links:
            episode_href = episode_link.get_attribute('href')
            if episode_href and 'show' not in episode_href:
                print("Episode Link:", episode_href)
                episode_and_show_list.append(episode_href)

    finally:
        driver.quit()


Click 1 successful
Click 2 successful
Click 3 successful
Episode Link: https://open.spotify.com/episode/6tPwmGJ4WKh04U82hCtGM0
Episode Link: https://open.spotify.com/episode/6G77vKu5HMVjnxmNYWTpgm
Episode Link: https://open.spotify.com/episode/5WY6g608AjhfF8AqQatxLb
Episode Link: https://open.spotify.com/episode/2rUoYlIkv1SmDSWpZLhCWI
Episode Link: https://open.spotify.com/episode/4gAbW6QEJsltCvUsfd3NdW
Episode Link: https://open.spotify.com/episode/7ndsxdboK4dxWs5BvUFp4P
Episode Link: https://open.spotify.com/episode/72yX4xayGrxXaIQv8NgQ3S
Episode Link: https://open.spotify.com/episode/6oCz7AqlQDHu4ZWrn21kC3
Episode Link: https://open.spotify.com/episode/0sWyuKGDPXiI64ycdKO7ls
Episode Link: https://open.spotify.com/episode/58CgSrPor1Uh6mMu8z7WaI
Episode Link: https://open.spotify.com/episode/4gzfcWJ0lfGYcDwCKf0wol
Episode Link: https://open.spotify.com/episode/5VhoEetiuhb8C5khvN5jJz
Episode Link: https://open.spotify.com/episode/07Nr4mEGSerj9RMMjodO7C
Episode Link: https://open.spotif

In [21]:
episode_and_show_list

['https://open.spotify.com/episode/6tPwmGJ4WKh04U82hCtGM0',
 'https://open.spotify.com/episode/6G77vKu5HMVjnxmNYWTpgm',
 'https://open.spotify.com/episode/5WY6g608AjhfF8AqQatxLb',
 'https://open.spotify.com/episode/2rUoYlIkv1SmDSWpZLhCWI',
 'https://open.spotify.com/episode/4gAbW6QEJsltCvUsfd3NdW',
 'https://open.spotify.com/episode/7ndsxdboK4dxWs5BvUFp4P',
 'https://open.spotify.com/episode/72yX4xayGrxXaIQv8NgQ3S',
 'https://open.spotify.com/episode/6oCz7AqlQDHu4ZWrn21kC3',
 'https://open.spotify.com/episode/0sWyuKGDPXiI64ycdKO7ls',
 'https://open.spotify.com/episode/58CgSrPor1Uh6mMu8z7WaI',
 'https://open.spotify.com/episode/4gzfcWJ0lfGYcDwCKf0wol',
 'https://open.spotify.com/episode/5VhoEetiuhb8C5khvN5jJz',
 'https://open.spotify.com/episode/07Nr4mEGSerj9RMMjodO7C',
 'https://open.spotify.com/episode/7LDBCnUkObt3rr5ykEgyqP',
 'https://open.spotify.com/episode/0D6qjm9C5yCnZh1vhnxQH3',
 'https://open.spotify.com/episode/0iaYnuxhebYCiuEbESB7ng',
 'https://open.spotify.com/episode/5MEqZ

In [22]:
# Keep only the collected links that contain the substring "episode",
# echoing each one as it is kept.
episode_urls = []
for url in episode_and_show_list:
    if "episode" not in url:
        continue
    print(url)
    episode_urls.append(url)

https://open.spotify.com/episode/6tPwmGJ4WKh04U82hCtGM0
https://open.spotify.com/episode/6G77vKu5HMVjnxmNYWTpgm
https://open.spotify.com/episode/5WY6g608AjhfF8AqQatxLb
https://open.spotify.com/episode/2rUoYlIkv1SmDSWpZLhCWI
https://open.spotify.com/episode/4gAbW6QEJsltCvUsfd3NdW
https://open.spotify.com/episode/7ndsxdboK4dxWs5BvUFp4P
https://open.spotify.com/episode/72yX4xayGrxXaIQv8NgQ3S
https://open.spotify.com/episode/6oCz7AqlQDHu4ZWrn21kC3
https://open.spotify.com/episode/0sWyuKGDPXiI64ycdKO7ls
https://open.spotify.com/episode/58CgSrPor1Uh6mMu8z7WaI
https://open.spotify.com/episode/4gzfcWJ0lfGYcDwCKf0wol
https://open.spotify.com/episode/5VhoEetiuhb8C5khvN5jJz
https://open.spotify.com/episode/07Nr4mEGSerj9RMMjodO7C
https://open.spotify.com/episode/7LDBCnUkObt3rr5ykEgyqP
https://open.spotify.com/episode/0D6qjm9C5yCnZh1vhnxQH3
https://open.spotify.com/episode/0iaYnuxhebYCiuEbESB7ng
https://open.spotify.com/episode/5MEqZU122WFlOH1fJabtVQ
https://open.spotify.com/episode/4yzeLGkEA1dRNc2

# Concatenate Info from Apple and Spotify

Episode 7:

TMGP Ep 07 with sleep expert and biohacker Riley Jarvis

Riley is a sleep consultant who has helped high performers achieve more out of their personal and work life using cutting-edge scientific lab testing, strategies, and techniques.

With only so many waking hours in the day, the quality of hours sleeping is what can make the difference between peak performance and functioning at a fraction of your true potential.

Currently residing in Vancouver, Canada, Riley spends most of his time outdoors, hiking, snowboarding, and finding the next exciting adventure. Inside the “sleep lab”, he enjoys reading the latest sleep intel, collaborating with other thought leaders, and helping clients get life-changing results.

Riley is also a qualified Health Coach for Peak Performance & Flow and a qualified sleep consultant coach at TheSleepConsultant.Com

In today’s episode, we can look forward to advice about supplements for energy, focus, sleeping aids, general health and hormone supplements, tips on sleep, discussions about health, the importance of sleep, and biohacking through the use of cutting-edge science and medicine.

Apple:

https://podcasts.apple.com/za/podcast/tmgp-ep-07-with-sleep-expert-and-biohacker-riley-jarvis/id1717906577?i=1000643863544

Spotify:

https://open.spotify.com/episode/0iaYnuxhebYCiuEbESB7ng

In [46]:
def create_episode_info(episode_titles, episode_descriptions, episode_urls, episode_apple_urls):
    """Combine the parallel episode lists into one list of dictionaries.

    Entries are matched by position: index ``i`` of every list is taken to
    describe the same episode. One dictionary is produced per title, so the
    other lists may be longer than ``episode_titles`` (extra items are
    ignored) but not shorter.

    Args:
        episode_titles: Episode titles; determines how many entries are built.
        episode_descriptions: Episode descriptions, indexed like the titles.
        episode_urls: Spotify episode URLs, indexed like the titles.
        episode_apple_urls: Apple episode URLs, indexed like the titles.

    Returns:
        A list of dicts shaped
        ``{"title", "description", "urls": {"spotify", "apple"}}``.

    Raises:
        IndexError: If any of the other lists is shorter than
            ``episode_titles``.
    """
    return [
        {
            "title": title,
            "description": episode_descriptions[position],
            "urls": {
                "spotify": episode_urls[position],
                "apple": episode_apple_urls[position],
            },
        }
        for position, title in enumerate(episode_titles)
    ]

In [24]:
# episodes

In [25]:
# Dump the four parallel lists, one per line, to eyeball their contents.
print(
    episode_titles,
    episode_descriptions,
    episode_urls,
    episode_apple_urls,
    sep="\n",
)

[]
[]
['https://open.spotify.com/episode/6tPwmGJ4WKh04U82hCtGM0', 'https://open.spotify.com/episode/6G77vKu5HMVjnxmNYWTpgm', 'https://open.spotify.com/episode/5WY6g608AjhfF8AqQatxLb', 'https://open.spotify.com/episode/2rUoYlIkv1SmDSWpZLhCWI', 'https://open.spotify.com/episode/4gAbW6QEJsltCvUsfd3NdW', 'https://open.spotify.com/episode/7ndsxdboK4dxWs5BvUFp4P', 'https://open.spotify.com/episode/72yX4xayGrxXaIQv8NgQ3S', 'https://open.spotify.com/episode/6oCz7AqlQDHu4ZWrn21kC3', 'https://open.spotify.com/episode/0sWyuKGDPXiI64ycdKO7ls', 'https://open.spotify.com/episode/58CgSrPor1Uh6mMu8z7WaI', 'https://open.spotify.com/episode/4gzfcWJ0lfGYcDwCKf0wol', 'https://open.spotify.com/episode/5VhoEetiuhb8C5khvN5jJz', 'https://open.spotify.com/episode/07Nr4mEGSerj9RMMjodO7C', 'https://open.spotify.com/episode/7LDBCnUkObt3rr5ykEgyqP', 'https://open.spotify.com/episode/0D6qjm9C5yCnZh1vhnxQH3', 'https://open.spotify.com/episode/0iaYnuxhebYCiuEbESB7ng', 'https://open.spotify.com/episode/5MEqZU122WFlOH1

In [26]:
len(episode_titles)

0

In [27]:
len(episode_descriptions)

0

In [28]:
len(episode_urls)

22

In [29]:
len(episode_apple_urls)

0

In [30]:
# Merge the Apple and Spotify lists into one list of per-episode dicts.
formatted_episodes = create_episode_info(
    episode_titles=episode_titles,
    episode_descriptions=episode_descriptions,
    episode_urls=episode_urls,
    episode_apple_urls=episode_apple_urls,
)

In [31]:
formatted_episodes

[]

# Format into HTML for WEBSITE

In [47]:
import html

# Print one HTML snippet per episode. Episodes are numbered downwards from
# len(episode_titles), so the first list entry gets the highest number.
# Titles, descriptions and URLs come from scraped pages, so they are escaped
# before being placed into markup: text nodes keep their quotes readable,
# while href values also have quotes escaped because the attribute is
# single-quoted.
for i, title in enumerate(episode_titles):
    apple_href = html.escape(episode_apple_urls[i], quote=True)
    spotify_href = html.escape(episode_urls[i], quote=True)
    description_text = html.escape(episode_descriptions[i], quote=False)

    print(f"<strong>Episode {len(episode_titles) - i} :</strong><br>")
    print("<strong>" + html.escape(title, quote=False) + "</strong><br>")
    print("<br>" + description_text + "<br>")
    print("<br><strong>Apple:</strong><br><a href='" + apple_href + "'>" + apple_href + "</a><br>")
    print("<br><strong>Spotify:</strong><br><a href='" + spotify_href + "'>" + spotify_href + "</a><br>")
    print("<br><br>")

<strong>Episode 22 :</strong><br>
<strong>TMGP Ep 22 with Injury Coach, Chiropractor and Powerlifter Dr J’aime Goguen-Locke</strong><br>
<br>Dr. J'aime Goguen is a dedicated chiropractor and injury coach with over 18 years of experience helping individuals recover from musculoskeletal injuries. She earned her Doctor of Chiropractic degree from Parker University and takes a holistic approach to treatment, incorporating techniques such as deep tissue myofascial release, active rehabilitation, and nutritional guidance to support her patients' recovery. As an injury coach, specializing in powerlifting injury management and prevention, Dr. J'aime emphasizes preventative care and works collaboratively with other healthcare professionals to provide comprehensive, tailored care for her patients. She is committed to patient education and empowerment, setting her apart as a leader in the field of chiropractic care. In addition to her work as a chiropractor, Dr. J'aime is an accomplished powerlif