In [1]:
import json
import time
from datetime import datetime, timezone  # This import is important
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# --- Configuration ---
# Entry page of the course site: scrape_course_content() opens this URL first
# and reads the main-module links from its sidebar.
COURSE_WEBSITE_URL = "https://tds.s-anand.net/#/2025-01/"
# JSON file that the __main__ block writes the scraped chunks to.
OUTPUT_FILENAME = "tds_course_content_deep.json"

# NOTE: the former get_utc_now_iso() helper was removed to prevent a NameError;
# timestamps are now produced inline with datetime.now(timezone.utc).isoformat().

def scrape_page_content(soup, url, module_name):
    """
    Extract text chunks from a single, already-parsed course page.

    Args:
        soup: Parsed page offering a BeautifulSoup-style ``find`` method.
        url: Address of the page; stored as ``source_url`` in every chunk.
        module_name: Name of the owning main module; stored in the metadata.

    Returns:
        A list of document dicts, one per ``p``/``li``/``tr``/``h2``/``h3``
        element whose text is longer than 10 characters. An empty list is
        returned when the page has no ``<article id="main">``.
    """
    article = soup.find('article', id='main')
    if not article:
        print(f"  -> ERROR: Could not find the <article id='main'> tag on page {url}. Skipping.")
        return []

    # Look the heading up once instead of once for the test and once for the value.
    heading = article.find('h1')
    # A separator is required here: with strip=True alone, the stripped text
    # fragments of nested tags are joined with nothing in between, which glues
    # words together (e.g. "Install<b>the tool</b>" -> "Installthe tool").
    page_title = heading.get_text(" ", strip=True) if heading else "Untitled"

    # One timestamp per page: every chunk below comes from the same snapshot.
    scraped_at = datetime.now(timezone.utc).isoformat()

    documents = []
    # NOTE(review): find_all also returns nested matches (e.g. a <p> inside an
    # <li>), so such text is emitted once per matching ancestor.
    for element in article.find_all(['p', 'li', 'tr', 'h2', 'h3']):
        content_text = element.get_text(" ", strip=True)
        # Skip very short fragments (10 characters or fewer).
        if len(content_text) > 10:
            documents.append({
                "source_url": url,
                "source_type": "course_content",
                "title": page_title,
                "content": content_text,
                "metadata": {
                    "scraped_at_utc": scraped_at,
                    "module": module_name
                }
            })
    print(f"  -> Found and processed {len(documents)} text chunks.")
    return documents

def _collect_sidebar_links(soup, selector):
    """Return ``{'name', 'href'}`` dicts for the anchors matching *selector*.

    Anchors that lack a ``title`` or an ``href`` attribute are skipped.
    """
    return [
        {'name': link['title'], 'href': link['href']}
        for link in soup.select(selector)
        if link.has_attr('title') and link.has_attr('href')
    ]


def _load_course_page(driver, url, timeout=10, settle_seconds=1):
    """Open *url* in *driver* and return the rendered page as a parsed soup.

    Waits for an ``<h1>`` inside ``article#main`` and then pauses for
    *settle_seconds* before reading the page source.
    """
    driver.get(url)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "article#main h1"))
    )
    # NOTE(review): the site appears to use hash routes (the base URL contains
    # "#/"), so moving between pages presumably does not reload the document
    # and the previous page's heading could satisfy the wait above. The pause
    # gives the new content time to render - confirm against the live site.
    time.sleep(settle_seconds)
    return BeautifulSoup(driver.page_source, 'html.parser')


def scrape_course_content():
    """
    Performs a deep scrape of the course website, including all nested submodule pages.

    Starts a headless Edge WebDriver, reads the main-module links from the
    sidebar of the home page, then scrapes each module landing page and every
    submodule page listed under the active sidebar folder.

    Returns:
        A list of document dicts as produced by ``scrape_page_content``. The
        list is empty when no main-module links are found. If an exception
        occurs it is printed and the chunks collected so far are returned.
    """
    print("--- [Course Scraper] Starting Deep Scrape ---")

    all_documents = []
    driver = None
    try:
        service = Service(EdgeChromiumDriverManager().install())
        options = Options()
        options.add_argument('--headless')
        options.add_argument("window-size=1920,1080")
        options.add_argument("--log-level=3")
        driver = webdriver.Edge(service=service, options=options)
        print("[Course Scraper] WebDriver initialized.")

        print(f"[Course Scraper] Navigating to home page: {COURSE_WEBSITE_URL}")
        driver.get(COURSE_WEBSITE_URL)
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, "aside.sidebar")))
        print("[Course Scraper] Home page loaded.")
        # Give the sidebar a moment to fill in before snapshotting the page.
        time.sleep(2)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        main_module_links = _collect_sidebar_links(
            soup, 'aside.sidebar .sidebar-nav > ul > li.folder > a'
        )

        if not main_module_links:
            print("[Course Scraper] CRITICAL: No main module links found. Exiting.")
            return []

        print(f"[Course Scraper] Found {len(main_module_links)} main modules to process.")

        for i, module in enumerate(main_module_links):
            module_name = module['name']
            module_url = "https://tds.s-anand.net/" + module['href']
            print(f"\n[Course Scraper] ({i+1}/{len(main_module_links)}) Processing MAIN MODULE: '{module_name}'")

            current_soup = _load_course_page(driver, module_url)

            print(f"  -> Scraping landing page for '{module_name}'...")
            all_documents.extend(scrape_page_content(current_soup, module_url, module_name))

            # Submodules are the links nested under the currently active folder.
            submodule_links = _collect_sidebar_links(
                current_soup, 'aside.sidebar li.folder.active > ul > li > a'
            )
            if not submodule_links:
                continue

            print(f"  -> Found {len(submodule_links)} submodules. Scraping them now...")
            for j, submodule in enumerate(submodule_links):
                submodule_name = submodule['name']
                submodule_url = "https://tds.s-anand.net/" + submodule['href']
                print(f"    - ({j+1}/{len(submodule_links)}) Scraping SUBMODULE: '{submodule_name}'")

                # Same loader as the landing page, so submodule pages get the
                # same settle delay instead of being parsed immediately.
                submodule_soup = _load_course_page(driver, submodule_url)
                # Chunks are tagged with the parent module's name.
                all_documents.extend(scrape_page_content(submodule_soup, submodule_url, module_name))

    except Exception as e:
        # Top-level boundary: report the failure and fall through so the
        # chunks gathered before the error are still returned.
        print(f"[Course Scraper] An unexpected error occurred: {e}")
    finally:
        # Always release the browser, including on the early return above.
        if driver:
            driver.quit()

    print(f"\n--- [Course Scraper] Finished Deep Scrape. Found a total of {len(all_documents)} chunks. ---")
    return all_documents


if __name__ == "__main__":
    # Banner with the start time so separate runs can be told apart in logs.
    divider = "="*50
    print(divider)
    print("Starting the TDS Course Content DEEP scraping process...")
    print(f"Current UTC time: {datetime.now(timezone.utc).isoformat()}")
    print(divider)

    course_documents = scrape_course_content()

    if course_documents:
        # Persist the chunks as indented JSON, keeping non-ASCII text verbatim.
        with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as f:
            json.dump(course_documents, f, indent=2, ensure_ascii=False)
        print(f"\n✅ Success! Deep scrape data saved to '{OUTPUT_FILENAME}'. Total chunks: {len(course_documents)}")
    else:
        # Nothing was scraped, so no output file is written.
        print("\nCRITICAL WARNING: The final dataset is empty. Please check the logs.")

Starting the TDS Course Content DEEP scraping process...
Current UTC time: 2025-06-11T02:57:04.904119+00:00
--- [Course Scraper] Starting Deep Scrape ---
[Course Scraper] WebDriver initialized.
[Course Scraper] Navigating to home page: https://tds.s-anand.net/#/2025-01/
[Course Scraper] Home page loaded.
[Course Scraper] Found 8 main modules to process.

[Course Scraper] (1/8) Processing MAIN MODULE: 'Development Tools'
  -> Scraping landing page for 'Development Tools'...
  -> Found and processed 3 text chunks.

[Course Scraper] (2/8) Processing MAIN MODULE: 'Deployment Tools'
  -> Scraping landing page for 'Deployment Tools'...
  -> Found and processed 1 text chunks.

[Course Scraper] (3/8) Processing MAIN MODULE: 'Large Language Models'
  -> Scraping landing page for 'Large Language Models'...
  -> Found and processed 14 text chunks.

[Course Scraper] (4/8) Processing MAIN MODULE: 'Data Sourcing'
  -> Scraping landing page for 'Data Sourcing'...
  -> Found and processed 14 text chun