In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

def scrape_course_content(url):
    """Render ``url`` in headless Chrome and collect the course-content text.

    The page is opened in a headless Chrome session, given a fixed five
    seconds to finish rendering, and its HTML is then parsed with
    BeautifulSoup.

    Args:
        url: Address of the page to load.

    Returns:
        A list of strings, one per ``<div class="course-content">`` element
        found in the rendered page. Each string is the element's text with
        fragments stripped and joined by newlines. The list is empty when no
        such element exists.

    Any exception raised while starting the browser or loading the page
    propagates to the caller; the browser is still shut down in that case.
    """
    browser_opts = Options()
    for flag in ("--headless", "--disable-gpu"):
        browser_opts.add_argument(flag)

    # Relies on a Chrome driver being discoverable by Selenium.
    browser = webdriver.Chrome(options=browser_opts)
    try:
        browser.get(url)
        # Fixed pause so client-side rendering can finish before the
        # snapshot is taken; nothing here checks that it actually did.
        time.sleep(5)
        rendered_html = browser.page_source
    finally:
        # Always release the browser process, even if loading failed.
        browser.quit()

    parsed = BeautifulSoup(rendered_html, 'html.parser')

    # The selector is a guess at the page structure; adjust it after
    # inspecting the real HTML.
    return [
        block.get_text(separator="\n", strip=True)
        for block in parsed.find_all('div', class_='course-content')
    ]

if __name__ == "__main__":
    # Scrape the course page and dump every extracted block to stdout.
    url = "https://tds.s-anand.net/#/2025-01/"
    course_data = scrape_course_content(url)

    if not course_data:
        # An empty result means the selector matched nothing in the page.
        print("No course content found. Check the element selectors and inspect the page's HTML structure.")
    else:
        for block_no, block_text in enumerate(course_data, start=1):
            print(f"--- Content Block {block_no} ---")
            print(block_text, "\n")

Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)


No course content found. Check the element selectors and inspect the page's HTML structure.


In [4]:
from selenium import webdriver
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

def scrape_course_content(url):
    """Render ``url`` in headless Edge and collect the course-content text.

    After loading the page, the function waits up to 15 seconds for either a
    ``div.course-content`` or a ``section#main-content`` element to appear.
    Waiting on both selectors makes the fallback lookup below reachable:
    previously the wait covered only ``div.course-content``, so a page that
    had only ``section#main-content`` always timed out and returned ``[]``.

    Args:
        url: Address of the page to load.

    Returns:
        A list of strings, one per matched element: every
        ``div.course-content`` if any exist, otherwise every
        ``section#main-content``. Each string is the element's text with
        fragments stripped and joined by newlines. An empty list is returned
        when neither selector appears before the wait times out.

    Side effects:
        On timeout, writes the current page source to
        ``debug_page_source.html``. On success, saves
        ``debug_screenshot.png``. Prints a status line in both cases.

    Errors other than the wait timing out (for example a failure to start
    the browser or load the page) propagate to the caller rather than being
    reported as a timeout; the browser is shut down in every case.
    """
    # Imported locally so the cell's import block does not need to change.
    from selenium.common.exceptions import TimeoutException

    # Set up Selenium with headless Edge.
    edge_options = EdgeOptions()
    # NOTE(review): kept from the original; this looks like a no-op attribute
    # on current Selenium releases -- confirm before removing.
    edge_options.use_chromium = True
    edge_options.add_argument("--headless")
    edge_options.add_argument("--disable-gpu")

    # Relies on an Edge driver being discoverable by Selenium.
    driver = webdriver.Edge(options=edge_options)

    try:
        driver.get(url)

        # Wait for whichever of the two candidate containers shows up first.
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "div.course-content, section#main-content")
                )
            )
        except TimeoutException:
            print("Timeout or element not found. Captured page source for debugging.")
            with open('debug_page_source.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            print("Page source written to debug_page_source.html")
            return []

        # Save a screenshot for debugging purposes.
        driver.save_screenshot("debug_screenshot.png")
        print("Screenshot captured as debug_screenshot.png")

        # Short extra pause in case more content is still being rendered.
        time.sleep(2)
        html = driver.page_source
    finally:
        # Always release the browser process, including on the early return.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')

    # Prefer the primary selector; fall back to the alternative container.
    course_contents = soup.find_all('div', class_='course-content')
    if not course_contents:
        course_contents = soup.find_all('section', id='main-content')

    return [
        content.get_text(separator="\n", strip=True)
        for content in course_contents
    ]

if __name__ == "__main__":
    # Scrape the course page and dump every extracted block to stdout.
    url = "https://tds.s-anand.net/#/2025-01/"
    course_data = scrape_course_content(url)

    if not course_data:
        # An empty result means no content was extracted from the page.
        print("No course content found. Check the element selectors and inspect the page's HTML structure.")
    else:
        for block_no, block_text in enumerate(course_data, start=1):
            print(f"--- Content Block {block_no} ---")
            print(block_text, "\n")

Timeout or element not found. Captured page source for debugging.
Page source written to debug_page_source.html
No course content found. Check the element selectors and inspect the page's HTML structure.
