In [1]:
import requests
from bs4 import BeautifulSoup
import os
import time
import re
import warnings

# Suppress the UserWarning from BeautifulSoup if no parser is explicitly specified
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

# --- Configuration ---
# BASE_URL: The main URL shown in the screenshot for "Tools in Data Science - Jan 2025"
# Replace with the actual URL of your IIT Madras TDS course content page.
# For example: "https://online.iitm.ac.in/courses/tds/jan2025" (This is a placeholder)
BASE_URL = "https://tds.s-anand.net/#/2025-01/"

# Output directory to save scraped course content
COURSE_CONTENT_OUTPUT_DIR = "scraped_tds_course_content"

# Delay between requests (to be polite and avoid being blocked)
REQUEST_DELAY = 1.5 # seconds

# --- Helper Functions ---

def get_page_content(url):
    """Fetches the content of a given URL."""
    try:
        print(f"Fetching: {url}")
        response = requests.get(url, timeout=15) # Increased timeout
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
        time.sleep(REQUEST_DELAY) # Be polite
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def save_content(filename, content, directory=COURSE_CONTENT_OUTPUT_DIR):
    """Saves content to a file in the specified directory."""
    os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, filename)
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Saved: {filepath}")
        return True
    except IOError as e:
        print(f"Error saving {filepath}: {e}")
        return False

def clean_html_text(html_content):
    """Removes common HTML tags and cleans up text, focusing on main content."""
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove script, style, and common navigation/layout elements
    for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'img', 'svg']):
        element.decompose()

    text = soup.get_text(separator='\n')

    # Remove excessive whitespace, newlines, and tabs
    text = re.sub(r'\n+', '\n', text) # Replace multiple newlines with single
    text = re.sub(r' +', ' ', text)   # Replace multiple spaces with single
    text = text.strip()               # Remove leading/trailing whitespace
    return text

def normalize_filename(text):
    """Cleans text to be suitable for a filename."""
    text = re.sub(r'[^\w\-_\. ]', '', text) # Remove invalid characters
    text = re.sub(r'[ ]+', '_', text)    # Replace spaces with underscores
    text = text.strip('_')
    return text[:100] # Limit filename length to avoid OS issues

# --- Main Scraping Logic ---

def scrape_tds_course_content():
    """
    Main function to orchestrate the scraping of course content based on the provided screenshot,
    ignoring discussion threads.
    """
    print(f"Starting course content scraping from {BASE_URL}")
    index_page_html = get_page_content(BASE_URL)

    if not index_page_html:
        print("Could not retrieve the main course page. Exiting.")
        return

    soup = BeautifulSoup(index_page_html, 'html.parser')

    # --- 1. Scrape the main overview text ---
    title_element = soup.find('h2', string='Tools in Data Science - Jan 2025')
    
    overview_text_content = ""
    if title_element:
        current_element = title_element
        # Iterate through siblings until we hit a clear end to the overview (e.g., the module list)
        while current_element and not (current_element.name == 'ul' or current_element.find('li')):
            # Stop if we encounter a div that clearly marks the start of module list
            # (e.g., if there's a div wrapping the module list, inspect its class/id)
            if current_element.name == 'div' and ('main-content' in current_element.get('class', []) or 'course-sections' in current_element.get('class', [])):
                break # Break if we hit a container for the modules
            
            # Append text from p, div, etc.
            if current_element.name in ['p', 'div', 'h3', 'span']: # Add other relevant tags
                 overview_text_content += current_element.get_text(separator='\n').strip() + '\n'
            
            # Special handling for elements like <ul> that might contain list items
            # (e.g., "This course exposes you to real-life tools" items)
            if current_element.name == 'ul':
                for li in current_element.find_all('li'):
                    overview_text_content += "- " + li.get_text().strip() + '\n'

            current_element = current_element.find_next_sibling()
        
        # Clean up the collected overview text
        overview_text_content = re.sub(r'\n+', '\n', overview_text_content).strip()
        save_content("course_overview.txt", overview_text_content)
    else:
        print("Warning: Could not find 'Tools in Data Science - Jan 2025' title. Scraping all text from main page.")
        # Fallback: Scrape all clean text from the main page if specific title not found
        cleaned_main_page_text = clean_html_text(index_page_html)
        if cleaned_main_page_text:
            save_content("course_overview_fallback.txt", cleaned_main_page_text)


    # --- 2. Identify Module and Project Links ---
    module_links_found = []
    
    # Target the main content block that holds module links based on the screenshot
    # The links "1. Development Tools", "2. Deployment Tools", etc., are likely within a specific div.
    # We'll try to find any <a> tag whose text starts with a number and a dot,
    # or contains "Project" followed by a number.

    # A more robust way would be to find the direct parent of these numbered lists/links.
    # E.g., if they are in a div with id="course-modules-list"
    # main_modules_container = soup.find('div', id='course-modules-list')
    
    # If no specific container, we can search the entire body for links matching the pattern.
    # The screenshot shows the module links are regular text followed by a link, not nested deeply.
    
    # Look for links that represent the actual module/project pages
    # We'll refine this by looking for specific text patterns
    all_potential_links_on_page = soup.find_all('a', href=True)
    
    for link in all_potential_links_on_page:
        href = link.get('href')
        text = link.get_text().strip()
        
        if not href or href.startswith(('#', 'javascript:')):
            continue
        
        # Exclude "Discussion Thread" links explicitly
        if "discussion thread" in text.lower():
            continue
            
        # Construct absolute URL
        if not href.startswith(('http://', 'https://')):
            abs_url = requests.compat.urljoin(BASE_URL, href)
        else:
            abs_url = href
        
        # Filter for module/project links: starts with "Number." or contains "Project Number"
        # Example: "1. Development Tools", "Project 1"
        if re.match(r'^\d+\.\s', text) or re.match(r'^Project\s\d+', text, re.IGNORECASE):
            # Check if the URL is relevant (e.g., stays within the course domain)
            if abs_url.startswith(BASE_URL.split('/')[0] + '//' + BASE_URL.split('/')[2]):
                module_links_found.append({'url': abs_url, 'text': text})
            else:
                print(f"Skipping external link for module: {text} -> {abs_url}")
        
    # Remove duplicates if any link is found multiple times
    unique_module_links = []
    processed_urls_for_modules = set()
    for link_info in module_links_found:
        if link_info['url'] not in processed_urls_for_modules:
            unique_module_links.append(link_info)
            processed_urls_for_modules.add(link_info['url'])

    print(f"Found {len(unique_module_links)} unique module/project content links.")

    # --- 3. Visit Each Module Page and Scrape Content ---
    for i, link_info in enumerate(unique_module_links):
        url = link_info['url']
        original_link_text = link_info['text']
        filename_prefix = normalize_filename(original_link_text)
        
        print(f"\nProcessing content from module: {original_link_text} ({url})")

        module_page_html = get_page_content(url)
        if module_page_html:
            cleaned_text = clean_html_text(module_page_html)
            if cleaned_text:
                filename = f"{filename_prefix}.txt"
                save_content(filename, cleaned_text)
            else:
                print(f"Warning: No significant text extracted from {url}")
        else:
            print(f"Failed to retrieve content for module: {original_link_text} ({url})")

    print("\nCourse content scraping finished.")

if __name__ == "__main__":
    # Ensure output directories exist
    os.makedirs(COURSE_CONTENT_OUTPUT_DIR, exist_ok=True)
    
    print("--- Starting IIT Madras TDS Course Content Scraping (HTML Only) ---")
    print("WARNING: This script is a template. You MUST customize `BASE_URL`.")
    print("Also, you might need to adjust the CSS selectors/HTML parsing logic ")
    print("if the actual page structure differs from the screenshot.")
    print("Ensure you have permission to scrape the website.")
    
    # You might need to add authentication logic here if the course content is behind a login.
    # (e.g., using requests.Session() and handling a login form POST request)
    
    scrape_tds_course_content()
    print("--- Scraping script execution complete ---")

--- Starting IIT Madras TDS Course Content Scraping (HTML Only) ---
Also, you might need to adjust the CSS selectors/HTML parsing logic 
if the actual page structure differs from the screenshot.
Ensure you have permission to scrape the website.
Starting course content scraping from https://tds.s-anand.net/#/2025-01/
Fetching: https://tds.s-anand.net/#/2025-01/
Saved: scraped_tds_course_content\course_overview_fallback.txt
Found 0 unique module/project content links.

Course content scraping finished.
--- Scraping script execution complete ---


In [2]:
import requests
from bs4 import BeautifulSoup
import os
import time
import re
import warnings

# Suppress the UserWarning from BeautifulSoup if no parser is explicitly specified
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

# --- Configuration ---
# BASE_URL: The main URL shown in the screenshot for "Tools in Data Science - Jan 2025"
# Replace with the actual URL of your IIT Madras TDS course content page.
# For example: "https://online.iitm.ac.in/courses/tds/jan2025" (This is a placeholder)
BASE_URL = "https://tds.s-anand.net/#/2025-01/"

# Output directory to save scraped course content
COURSE_CONTENT_OUTPUT_DIR = "scraped_tds_course_content"

# Delay between requests (to be polite and avoid being blocked)
REQUEST_DELAY = 1.5 # seconds

# --- Helper Functions ---

def get_page_content(url):
    """Fetches the content of a given URL."""
    try:
        print(f"Fetching: {url}")
        response = requests.get(url, timeout=15) # Increased timeout
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
        time.sleep(REQUEST_DELAY) # Be polite
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def save_content(filename, content, directory=COURSE_CONTENT_OUTPUT_DIR):
    """Saves content to a file in the specified directory."""
    os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, filename)
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Saved: {filepath}")
        return True
    except IOError as e:
        print(f"Error saving {filepath}: {e}")
        return False

def clean_html_text(html_element):
    """
    Removes common HTML tags and cleans up text from a given BeautifulSoup element,
    focusing on main content.
    """
    # Create a copy to avoid modifying the original soup object if needed elsewhere
    temp_soup = BeautifulSoup(str(html_element), 'html.parser')

    # Remove script, style, and common navigation/layout elements
    for element in temp_soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'img', 'svg', 'a']): # Exclude 'a' for overview only
        element.decompose()

    text = temp_soup.get_text(separator='\n')

    # Remove excessive whitespace, newlines, and tabs
    text = re.sub(r'\n+', '\n', text) # Replace multiple newlines with single
    text = re.sub(r' +', ' ', text)   # Replace multiple spaces with single
    text = text.strip()               # Remove leading/trailing whitespace
    return text

def normalize_filename(text):
    """Cleans text to be suitable for a filename."""
    text = re.sub(r'[^\w\-_\. ]', '', text) # Remove invalid characters
    text = re.sub(r'[ ]+', '_', text)    # Replace spaces with underscores
    text = text.strip('_')
    return text[:100] # Limit filename length to avoid OS issues

# --- Main Scraping Logic ---

def scrape_tds_course_content():
    """
    Main function to orchestrate the scraping of course content based on the provided screenshot,
    ignoring discussion threads.
    """
    print(f"Starting course content scraping from {BASE_URL}")
    index_page_html = get_page_content(BASE_URL)

    if not index_page_html:
        print("Could not retrieve the main course page. Exiting.")
        return

    soup = BeautifulSoup(index_page_html, 'html.parser')

    # --- 1. Scrape the main overview text ---
    # Attempt to find a main content area, or default to body.
    # The screenshot suggests the overview text is in a main content block, possibly not deeply nested.
    
    # Try to find a div or section that seems to contain the main content.
    # This is a common pattern: <main>, <div id="content">, <div class="main-body">, etc.
    # You might need to inspect the HTML to find the correct container.
    main_content_container = soup.find('main') or soup.find('div', class_='main-content') or soup.find('div', id='content') or soup.find('body')
    
    if main_content_container:
        # Clone the container's soup to remove links without affecting link discovery later
        overview_soup = BeautifulSoup(str(main_content_container), 'html.parser')
        
        # Remove elements that are part of the module list structure
        # This is a heuristic. You might need to refine this by looking for specific module list containers.
        for module_link_container in overview_soup.find_all('a', href=True):
            text = module_link_container.get_text().strip()
            if re.match(r'^\d+\.\s', text) or re.match(r'^Project\s\d+', text, re.IGNORECASE) or "discussion thread" in text.lower():
                # If this link is part of the modules, remove its parent element or itself
                if module_link_container.parent: # Try to remove the list item or paragraph containing it
                    module_link_container.parent.decompose()
                else: # Fallback: just remove the link itself
                    module_link_container.decompose()
        
        # After removing module-related elements, get the remaining text as overview
        overview_text_content = clean_html_text(overview_soup) # Use the modified clean_html_text function here
        save_content("course_overview.txt", overview_text_content)
        print("Extracted overview text.")
    else:
        print("Warning: Could not find a suitable main content container. Overview text might be incomplete.")
        save_content("course_overview_fallback.txt", clean_html_text(soup)) # Fallback: clean entire page for overview

    # --- 2. Identify Module and Project Links ---
    module_links_found = []
    
    # We now search for links on the *original* soup object before modification for overview
    all_potential_links_on_page = soup.find_all('a', href=True)
    
    for link in all_potential_links_on_page:
        href = link.get('href')
        text = link.get_text().strip()
        
        if not href or href.startswith(('#', 'javascript:')):
            continue
        
        # Exclude "Discussion Thread" links explicitly
        if "discussion thread" in text.lower():
            continue
            
        # Construct absolute URL
        if not href.startswith(('http://', 'https://')):
            abs_url = requests.compat.urljoin(BASE_URL, href)
        else:
            abs_url = href
        
        # Filter for module/project links: starts with "Number." or contains "Project Number"
        if re.match(r'^\d+\.\s', text) or re.match(r'^Project\s\d+', text, re.IGNORECASE):
            # Check if the URL is relevant (e.g., stays within the course domain)
            if abs_url.startswith(BASE_URL.split('/')[0] + '//' + BASE_URL.split('/')[2]):
                module_links_found.append({'url': abs_url, 'text': text})
            else:
                print(f"Skipping external link for module: {text} -> {abs_url}")
        
    # Remove duplicates if any link is found multiple times
    unique_module_links = []
    processed_urls_for_modules = set()
    for link_info in module_links_found:
        if link_info['url'] not in processed_urls_for_modules:
            unique_module_links.append(link_info)
            processed_urls_for_modules.add(link_info['url'])

    print(f"Found {len(unique_module_links)} unique module/project content links.")

    # --- 3. Visit Each Module Page and Scrape Content ---
    for i, link_info in enumerate(unique_module_links):
        url = link_info['url']
        original_link_text = link_info['text']
        filename_prefix = normalize_filename(original_link_text)
        
        print(f"\nProcessing content from module: {original_link_text} ({url})")

        module_page_html = get_page_content(url)
        if module_page_html:
            cleaned_text = clean_html_text(BeautifulSoup(module_page_html, 'html.parser')) # Pass soup object to clean_html_text
            if cleaned_text:
                filename = f"{filename_prefix}.txt"
                save_content(filename, cleaned_text)
            else:
                print(f"Warning: No significant text extracted from {url}")
        else:
            print(f"Failed to retrieve content for module: {original_link_text} ({url})")

    print("\nCourse content scraping finished.")

if __name__ == "__main__":
    # Ensure output directories exist
    os.makedirs(COURSE_CONTENT_OUTPUT_DIR, exist_ok=True)
    
    print("--- Starting IIT Madras TDS Course Content Scraping (HTML Only) ---")
    print("WARNING: This script is a template. You MUST customize `BASE_URL`.")
    print("Also, you might need to adjust the CSS selectors/HTML parsing logic ")
    print("if the actual page structure differs from the screenshot.")
    print("Ensure you have permission to scrape the website.")
    
    # You might need to add authentication logic here if the course content is behind a login.
    # (e.g., using requests.Session() and handling a login form POST request)
    
    scrape_tds_course_content()
    print("--- Scraping script execution complete ---")

--- Starting IIT Madras TDS Course Content Scraping (HTML Only) ---
Also, you might need to adjust the CSS selectors/HTML parsing logic 
if the actual page structure differs from the screenshot.
Ensure you have permission to scrape the website.
Starting course content scraping from https://tds.s-anand.net/#/2025-01/
Fetching: https://tds.s-anand.net/#/2025-01/
Saved: scraped_tds_course_content\course_overview.txt
Extracted overview text.
Found 0 unique module/project content links.

Course content scraping finished.
--- Scraping script execution complete ---


In [3]:
import requests
from bs4 import BeautifulSoup
import os
import time
import re
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

# --- Configuration ---
BASE_URL = "https://tds.s-anand.net/#/2025-01/" # <--- CONFIRM THIS!
COURSE_CONTENT_OUTPUT_DIR = "scraped_tds_course_content_debug" # Use a new debug directory
REQUEST_DELAY = 2 # seconds - increased slightly

# Add a User-Agent header to mimic a browser
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}

# --- Helper Functions ---

def get_page_content(url):
    """Fetches the content of a given URL."""
    try:
        print(f"Attempting to fetch: {url}")
        response = requests.get(url, timeout=15, headers=HEADERS) # Pass headers
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
        print(f"Successfully fetched {url}. Status Code: {response.status_code}")
        time.sleep(REQUEST_DELAY)
        
        # --- DEBUGGING STEP: Save raw HTML ---
        raw_html_filename = normalize_filename(f"raw__{url.replace('https://', '').replace('/', '_')[:50]}.html")
        raw_html_filepath = os.path.join(COURSE_CONTENT_OUTPUT_DIR, raw_html_filename)
        with open(raw_html_filepath, 'w', encoding='utf-8') as f:
            f.write(response.text)
        print(f"Saved raw HTML to: {raw_html_filepath}")
        # --- END DEBUGGING STEP ---
        
        return response.text
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error fetching {url}: {e.response.status_code} - {e.response.reason}")
        print(f"Response body (if available): {e.response.text[:500]}...") # Print first 500 chars
        return None
    except requests.exceptions.RequestException as e:
        print(f"General Request Error fetching {url}: {e}")
        return None

# (Keep save_content, clean_html_text, normalize_filename functions as they were in the last script)
# ... [copy paste from previous script starting here] ...
def save_content(filename, content, directory=COURSE_CONTENT_OUTPUT_DIR):
    """Saves content to a file in the specified directory."""
    os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, filename)
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Saved: {filepath}")
        return True
    except IOError as e:
        print(f"Error saving {filepath}: {e}")
        return False

def clean_html_text(html_element):
    """
    Removes common HTML tags and cleans up text from a given BeautifulSoup element,
    focusing on main content.
    """
    temp_soup = BeautifulSoup(str(html_element), 'html.parser')

    # Remove script, style, and common navigation/layout elements
    for element in temp_soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'img', 'svg', 'a']): # Keep 'a' for module pages, remove for overview
        element.decompose()

    text = temp_soup.get_text(separator='\n')

    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r' +', ' ', text)
    text = text.strip()
    return text

def normalize_filename(text):
    """Cleans text to be suitable for a filename."""
    text = re.sub(r'[^\w\-_\. ]', '', text)
    text = re.sub(r'[ ]+', '_', text)
    text = text.strip('_')
    return text[:100]

# --- Main Scraping Logic (mostly same as before, with minor debug changes) ---

def scrape_tds_course_content():
    """
    Main function to orchestrate the scraping of course content based on the provided screenshot,
    ignoring discussion threads.
    """
    print(f"Starting course content scraping from {BASE_URL}")
    index_page_html = get_page_content(BASE_URL)

    if not index_page_html:
        print("Could not retrieve the main course page. Check the error messages above. Exiting.")
        return

    soup = BeautifulSoup(index_page_html, 'html.parser')

    # --- 1. Scrape the main overview text ---
    main_content_container = soup.find('main') or soup.find('div', class_='main-content') or soup.find('div', id='content') or soup.find('body')
    
    if main_content_container:
        overview_soup = BeautifulSoup(str(main_content_container), 'html.parser')
        
        # Specific cleaning for overview: remove all links
        for a_tag in overview_soup.find_all('a'):
            a_tag.decompose()

        # Try to remove elements that look like module list items from the overview_soup
        # This is the tricky part without direct HTML.
        # Let's try to remove elements that contain text matching module patterns.
        # This is a heuristic. You might need to inspect the HTML of the module list.
        for elem in overview_soup.find_all(lambda tag: tag.name in ['div', 'p', 'li'] and (re.match(r'^\d+\.\s', tag.get_text(strip=True)) or re.match(r'^Project\s\d+', tag.get_text(strip=True), re.IGNORECASE))):
            elem.decompose() # Remove the whole element if it matches a module pattern

        overview_text_content = clean_html_text(overview_soup) 
        if overview_text_content.strip(): # Only save if there's actual content
            save_content("course_overview.txt", overview_text_content)
            print("Extracted overview text.")
        else:
            print("Warning: Overview text extraction resulted in blank content. Adjusting selectors needed.")
            save_content("course_overview_blank_debug.txt", "Blank content after processing.")
    else:
        print("Warning: Could not find a suitable main content container for overview. Overview text might be incomplete.")
        save_content("course_overview_fallback.txt", clean_html_text(soup)) # Fallback: clean entire page for overview

    # --- 2. Identify Module and Project Links ---
    module_links_found = []
    
    all_potential_links_on_page = soup.find_all('a', href=True)
    
    for link in all_potential_links_on_page:
        href = link.get('href')
        text = link.get_text().strip()
        
        if not href or href.startswith(('#', 'javascript:')):
            continue
        
        if "discussion thread" in text.lower():
            continue
            
        if not href.startswith(('http://', 'https://')):
            abs_url = requests.compat.urljoin(BASE_URL, href)
        else:
            abs_url = href
        
        if re.match(r'^\d+\.\s', text) or re.match(r'^Project\s\d+', text, re.IGNORECASE):
            if abs_url.startswith(BASE_URL.split('/')[0] + '//' + BASE_URL.split('/')[2]):
                module_links_found.append({'url': abs_url, 'text': text})
            else:
                print(f"Skipping external link for module: {text} -> {abs_url}")
        
    unique_module_links = []
    processed_urls_for_modules = set()
    for link_info in module_links_found:
        if link_info['url'] not in processed_urls_for_modules:
            unique_module_links.append(link_info)
            processed_urls_for_modules.add(link_info['url'])

    print(f"Found {len(unique_module_links)} unique module/project content links.")

    # --- 3. Visit Each Module Page and Scrape Content ---
    for i, link_info in enumerate(unique_module_links):
        url = link_info['url']
        original_link_text = link_info['text']
        filename_prefix = normalize_filename(original_link_text)
        
        print(f"\nProcessing content from module: {original_link_text} ({url})")

        module_page_html = get_page_content(url)
        if module_page_html:
            # For module pages, don't remove <a> tags generally in clean_html_text,
            # as they might be part of the module content (e.g., links to resources)
            # Create a new BeautifulSoup object for the module page to clean it.
            module_soup = BeautifulSoup(module_page_html, 'html.parser')
            
            # For cleaning module pages, we might want a slightly different 'clean_html_text' logic
            # specifically for this context, or make clean_html_text configurable.
            # For now, let's just make sure it's not decomposing <a> tags globally.
            # (The clean_html_text function's 'a' tag removal was specifically for the overview,
            # so we ensure it's not applied here if it causes issues on module pages.)
            
            # Re-checking clean_html_text: it *does* decompose 'a'. This might be too aggressive for module content.
            # Let's create a *separate* cleaning function for module pages that *doesn't* remove links.
            def clean_module_page_text(html_content_for_module):
                soup_module = BeautifulSoup(html_content_for_module, 'html.parser')
                # Remove script, style, header, footer, nav, aside, form, img, svg - but KEEP 'a'
                for element_to_decompose in soup_module(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'img', 'svg']):
                    element_to_decompose.decompose()
                
                text = soup_module.get_text(separator='\n')
                text = re.sub(r'\n+', '\n', text)
                text = re.sub(r' +', ' ', text)
                text = text.strip()
                return text

            cleaned_text = clean_module_page_text(module_page_html) # Use specialized cleaner
            
            if cleaned_text.strip():
                filename = f"{filename_prefix}.txt"
                save_content(filename, cleaned_text)
            else:
                print(f"Warning: No significant text extracted from {url} after cleaning.")
                save_content(f"{filename_prefix}_blank_debug.txt", "Blank content after cleaning.")
        else:
            print(f"Failed to retrieve content for module: {original_link_text} ({url})")

    print("\nCourse content scraping finished.")

if __name__ == "__main__":
    os.makedirs(COURSE_CONTENT_OUTPUT_DIR, exist_ok=True)
    
    print("--- Starting IIT Madras TDS Course Content Scraping (HTML Only) ---")
    print("WARNING: This script is a template. You MUST customize `BASE_URL`.")
    print("If output is blank, check the 'raw_*.html' files in the debug directory.")
    print("Remember to handle authentication if the content is behind a login.")
    print("Ensure you have permission to scrape the website.")
    
    scrape_tds_course_content()
    print("--- Scraping script execution complete ---")

--- Starting IIT Madras TDS Course Content Scraping (HTML Only) ---
If output is blank, check the 'raw_*.html' files in the debug directory.
Remember to handle authentication if the content is behind a login.
Ensure you have permission to scrape the website.
Starting course content scraping from https://tds.s-anand.net/#/2025-01/
Attempting to fetch: https://tds.s-anand.net/#/2025-01/
Successfully fetched https://tds.s-anand.net/#/2025-01/. Status Code: 200
Saved raw HTML to: scraped_tds_course_content_debug\raw__tds.s-anand.net__2025-01_.html
Saved: scraped_tds_course_content_debug\course_overview_blank_debug.txt
Found 0 unique module/project content links.

Course content scraping finished.
--- Scraping script execution complete ---


In [6]:
import os
import time
import re
import warnings

# --- Selenium imports for Edge ---
from selenium import webdriver
from selenium.webdriver.edge.service import Service # Change from chrome.service to edge.service
from selenium.webdriver.edge.options import Options # Import Options specifically for Edge
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

# --- Configuration ---
BASE_URL = "https://tds.s-anand.net/#/2025-01/" # <--- Your confirmed URL!

# Output directory to save scraped course content
COURSE_CONTENT_OUTPUT_DIR = "scraped_tds_course_content"

# Path to your MSEdgeDriver executable
# IMPORTANT: Replace with the actual path to your msedgedriver.exe
WEBDRIVER_PATH = r"C:\Users\sahil\Desktop\TA_Project\msedgedriver.exe"
# WEBDRIVER_PATH = "/usr/local/bin/msedgedriver" # For macOS/Linux (unlikely for Edge, but for completeness)

# Time to wait for elements to load (JavaScript rendering)
WAIT_TIME_SECONDS = 10 

# --- Helper Functions (Updated for Edge) ---

driver = None

def initialize_driver():
    global driver
    if driver is None:
        try:
            # Configure Edge options
            edge_options = Options() # Use Options for Edge
            # edge_options.add_argument("--headless")  # Run in headless mode (no UI)
            edge_options.add_argument("--disable-gpu")
            edge_options.add_argument("--no-sandbox")
            edge_options.add_argument("--disable-dev-shm-usage")
            edge_options.add_argument("window-size=1920x1080")
            
            service = Service(WEBDRIVER_PATH) # Use Service for Edge
            driver = webdriver.Edge(service=service, options=edge_options) # Use webdriver.Edge
            print("WebDriver (Microsoft Edge) initialized successfully.")
        except Exception as e:
            print(f"Error initializing WebDriver: {e}")
            print("Make sure msedgedriver is installed and its path is correct.")
            print("Also, ensure msedgedriver version matches your Edge browser version.")
            exit()

def get_page_content_selenium(url):
    """Fetches the content of a given URL using Selenium."""
    if driver is None:
        initialize_driver()
        
    try:
        print(f"Attempting to load page with Selenium: {url}")
        driver.get(url)
        
        # --- IMPORTANT: Wait for content to load ---
        WebDriverWait(driver, WAIT_TIME_SECONDS).until(
            EC.presence_of_element_located((By.XPATH, "//h2[contains(text(), 'Tools in Data Science - Jan 2025')]"))
        )
        print("Page content loaded (H2 header found).")
        
        time.sleep(1) # Small additional delay just to be safe after content loads
        
        return driver.page_source
    except Exception as e:
        print(f"Error loading page with Selenium {url}: {e}")
        driver.save_screenshot(os.path.join(COURSE_CONTENT_OUTPUT_DIR, "error_screenshot.png"))
        return None

def save_content(filename, content, directory=COURSE_CONTENT_OUTPUT_DIR):
    """Saves content to a file in the specified directory."""
    os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, filename)
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Saved: {filepath}")
        return True
    except IOError as e:
        print(f"Error saving {filepath}: {e}")
        return False

def clean_html_text(html_element, remove_links=False):
    """
    Removes common HTML tags and cleans up text from a given BeautifulSoup element.
    `remove_links` controls whether <a> tags are decomposed.
    """
    temp_soup = BeautifulSoup(str(html_element), 'html.parser')

    elements_to_decompose = ['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'img', 'svg']
    if remove_links:
        elements_to_decompose.append('a')

    for element in temp_soup(elements_to_decompose):
        element.decompose()

    text = temp_soup.get_text(separator='\n')

    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r' +', ' ', text)
    text = text.strip()
    return text

def normalize_filename(text):
    """Cleans text to be suitable for a filename."""
    text = re.sub(r'[^\w\-_\. ]', '', text)
    text = re.sub(r'[ ]+', '_', text)
    text = text.strip('_')
    return text[:100]

# --- Main Scraping Logic (same as previous Selenium version) ---

def scrape_tds_course_content():
    """
    Main function to orchestrate the scraping of course content using Selenium.
    """
    print(f"Starting course content scraping from {BASE_URL}")
    
    index_page_html = get_page_content_selenium(BASE_URL)

    if not index_page_html:
        print("Could not retrieve the main course page with Selenium. Exiting.")
        return

    soup = BeautifulSoup(index_page_html, 'html.parser')

    # --- 1. Scrape the main overview text ---
    # Find a main content container based on your HTML structure.
    # You might need to inspect the HTML for the actual class/id of this container.
    main_content_container = soup.find('main') or soup.find('div', class_='content-wrapper') or soup.find('div', id='root') or soup.find('body') # <-- Adjust this selector
    
    if main_content_container:
        overview_soup = BeautifulSoup(str(main_content_container), 'html.parser')
        
        for link_elem in overview_soup.find_all('a', href=True):
            text = link_elem.get_text(strip=True)
            if re.match(r'^\d+\.\s', text) or re.match(r'^Project\s\d+', text, re.IGNORECASE) or "discussion thread" in text.lower():
                if link_elem.parent and link_elem.parent.name in ['li', 'div', 'p']:
                    link_elem.parent.decompose()
                else:
                    link_elem.decompose()

        overview_text_content = clean_html_text(overview_soup, remove_links=True) 
        if overview_text_content.strip():
            save_content("course_overview.txt", overview_text_content)
            print("Extracted overview text.")
        else:
            print("Warning: Overview text extraction resulted in blank content after cleaning. Adjusting selectors needed.")
            save_content("course_overview_blank_debug.txt", "Blank content after processing (overview).")
    else:
        print("Warning: Could not find a suitable main content container for overview. Fallback to cleaning entire page.")
        save_content("course_overview_fallback.txt", clean_html_text(soup, remove_links=True)) 

    # --- 2. Identify Module and Project Links ---
    module_links_found = []
    
    potential_links = soup.find_all('a', href=True)
    
    for link in potential_links:
        href = link.get('href')
        text = link.get_text().strip()
        
        if not href or href.startswith(('#', 'javascript:')):
            continue
        
        if "discussion thread" in text.lower() or "search" in text.lower() or "login" in text.lower():
            continue
            
        if not href.startswith(('http://', 'https://')):
            abs_url = requests.compat.urljoin(BASE_URL, href)
        else:
            abs_url = href
        
        if re.match(r'^\d+\.\s', text) or re.match(r'^Project\s\d+', text, re.IGNORECASE):
            if abs_url.startswith(BASE_URL.split('/')[0] + '//' + BASE_URL.split('/')[2]):
                module_links_found.append({'url': abs_url, 'text': text})
            else:
                print(f"Skipping external link for module: {text} -> {abs_url}")
        
    unique_module_links = []
    processed_urls_for_modules = set()
    for link_info in module_links_found:
        if link_info['url'] not in processed_urls_for_modules:
            unique_module_links.append(link_info)
            processed_urls_for_modules.add(link_info['url'])

    print(f"Found {len(unique_module_links)} unique module/project content links.")

    # --- 3. Visit Each Module Page and Scrape Content ---
    for i, link_info in enumerate(unique_module_links):
        url = link_info['url']
        original_link_text = link_info['text']
        filename_prefix = normalize_filename(original_link_text)
        
        print(f"\nProcessing content from module: {original_link_text} ({url})")

        module_page_html = get_page_content_selenium(url)
        if module_page_html:
            module_soup = BeautifulSoup(module_page_html, 'html.parser')
            cleaned_text = clean_html_text(module_soup, remove_links=False) 
            
            if cleaned_text.strip():
                filename = f"{filename_prefix}.txt"
                save_content(filename, cleaned_text)
            else:
                print(f"Warning: No significant text extracted from {url} after cleaning. Saved blank debug file.")
                save_content(f"{filename_prefix}_blank_debug.txt", "Blank content after processing (module).")
        else:
            print(f"Failed to retrieve content for module: {original_link_text} ({url})")

    # --- Cleanup ---
    if driver:
        driver.quit()
        print("WebDriver closed.")
    print("\nCourse content scraping finished.")

if __name__ == "__main__":
    os.makedirs(COURSE_CONTENT_OUTPUT_DIR, exist_ok=True)
    
    print("--- Starting IIT Madras TDS Course Content Scraping (Selenium - Microsoft Edge) ---")
    print("WARNING: This script now uses Selenium with MSEdgeDriver. Ensure it's installed.")
    print(f"Ensure WEBDRIVER_PATH is correct: '{WEBDRIVER_PATH}'")
    print("Remember to handle authentication if the content is behind a login.")
    print("Ensure you have permission to scrape the website.")
    
    scrape_tds_course_content()
    print("--- Scraping script execution complete ---")

--- Starting IIT Madras TDS Course Content Scraping (Selenium - Microsoft Edge) ---
Ensure WEBDRIVER_PATH is correct: 'C:\Users\sahil\Desktop\TA_Project\msedgedriver.exe'
Remember to handle authentication if the content is behind a login.
Ensure you have permission to scrape the website.
Starting course content scraping from https://tds.s-anand.net/#/2025-01/
WebDriver (Microsoft Edge) initialized successfully.
Attempting to load page with Selenium: https://tds.s-anand.net/#/2025-01/
Error loading page with Selenium https://tds.s-anand.net/#/2025-01/: Message: 
Stacktrace:
	GetHandleVerifier [0x0x7ff7ce293865+25605]
	(No symbol) [0x0x7ff7ce1e3970]
	Microsoft::Applications::Events::EventProperty::to_string [0x0x7ff7ce516e7a+1962506]
	(No symbol) [0x0x7ff7cdff86c4]
	(No symbol) [0x0x7ff7cdff898b]
	(No symbol) [0x0x7ff7ce039af7]
	(No symbol) [0x0x7ff7ce019e8f]
	(No symbol) [0x0x7ff7cdfeea5d]
	(No symbol) [0x0x7ff7ce0375df]
	(No symbol) [0x0x7ff7ce019bb3]
	(No symbol) [0x0x7ff7cdfedf56]
	(

In [13]:
import os
import time
import re
import warnings

from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

# --- Configuration ---
BASE_URL = "https://tds.s-anand.net/#/README?id=tools-in-data-science-may-2025" # Your confirmed URL!
COURSE_CONTENT_OUTPUT_DIR = "scraped_tds_course_content"
WEBDRIVER_PATH = r"C:\Users\sahil\Desktop\TA_Project\msedgedriver.exe" # Using raw string for path
WAIT_TIME_SECONDS = 15 # Increased wait time

driver = None

def initialize_driver():
    global driver
    if driver is None:
        try:
            edge_options = Options()
            # UNCOMMENT THE LINE BELOW TO SEE THE BROWSER WINDOW FOR DEBUGGING
            # edge_options.add_argument("--headless")
            edge_options.add_argument("--disable-gpu")
            edge_options.add_argument("--no-sandbox")
            edge_options.add_argument("--disable-dev-shm-usage")
            edge_options.add_argument("window-size=1920x1080")
            
            service = Service(WEBDRIVER_PATH)
            driver = webdriver.Edge(service=service, options=edge_options)
            print("WebDriver (Microsoft Edge) initialized successfully.")
        except Exception as e:
            print(f"Error initializing WebDriver: {e}")
            print(f"Make sure msedgedriver is installed, its path '{WEBDRIVER_PATH}' is correct, and its version matches your Edge browser version.")
            exit()

def get_page_content_selenium(url):
    """Fetches the content of a given URL using Selenium."""
    if driver is None:
        initialize_driver()
        
    try:
        print(f"Attempting to load page with Selenium: {url}")
        driver.get(url)
        
        # --- IMPORTANT: Wait for a reliable element to load ---
        # The H2 might not be immediately visible. Let's try to wait for a more generic element
        # that indicates the *page has fully loaded its basic structure* from JS.
        # Given the screenshot, the left sidebar element with "Tools in Data Science"
        # or the "Tools in Data Science - Jan 2025" header itself should be reliable targets.
        
        # Waiting for the main H2 title to be present and visible
        WebDriverWait(driver, WAIT_TIME_SECONDS).until(
            EC.visibility_of_element_located((By.XPATH, "//h2[contains(text(), 'Tools in Data Science - Jan 2025')]"))
        )
        print("Page content loaded (H2 header found and visible).")
        
        # You might also add a small fixed delay after the elements are expected to load
        # to ensure all dynamic content has settled.
        time.sleep(2) 
        
        return driver.page_source
    except Exception as e:
        print(f"Error loading page with Selenium {url}: {e}")
        # Save a screenshot *if* an error occurs, to see the state of the browser
        try:
            driver.save_screenshot(os.path.join(COURSE_CONTENT_OUTPUT_DIR, "error_screenshot_after_load_attempt.png"))
            print(f"Saved error screenshot to {os.path.join(COURSE_CONTENT_OUTPUT_DIR, 'error_screenshot_after_load_attempt.png')}")
        except Exception as ss_e:
            print(f"Could not save screenshot: {ss_e}")
        return None

def save_content(filename, content, directory=COURSE_CONTENT_OUTPUT_DIR):
    os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, filename)
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Saved: {filepath}")
        return True
    except IOError as e:
        print(f"Error saving {filepath}: {e}")
        return False

def clean_html_text(html_element, remove_links=False):
    temp_soup = BeautifulSoup(str(html_element), 'html.parser')
    elements_to_decompose = ['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'img', 'svg']
    if remove_links:
        elements_to_decompose.append('a')
    for element in temp_soup(elements_to_decompose):
        element.decompose()
    text = temp_soup.get_text(separator='\n')
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r' +', ' ', text)
    text = text.strip()
    return text

def normalize_filename(text):
    text = re.sub(r'[^\w\-_\. ]', '', text)
    text = re.sub(r'[ ]+', '_', text)
    text = text.strip('_')
    return text[:100]

# --- Main Scraping Logic ---

def scrape_tds_course_content():
    print(f"Starting course content scraping from {BASE_URL}")
    
    index_page_html = get_page_content_selenium(BASE_URL)

    if not index_page_html:
        print("Could not retrieve the main course page with Selenium. Exiting.")
        return

    soup = BeautifulSoup(index_page_html, 'html.parser')

    # --- 1. Scrape the main overview text ---
    # Attempt to find a main content container based on your HTML structure.
    # From the screenshot, it looks like the content is within a main area.
    # Common guesses for SPAs: <div id="app">, <div id="root">, <main>, <div class="main-content">
    # Try using the `div` with class `sc-iomgmy` or similar if inspecting the live page.
    main_content_container = soup.find('main') or soup.find('div', class_='main-content') or soup.find('div', id='root') or soup.find('body') 
    
    if main_content_container:
        overview_soup = BeautifulSoup(str(main_content_container), 'html.parser')
        
        # Remove elements that are part of the module list structure from the overview_soup
        # This is a heuristic. You might need to refine this by looking for specific module list containers.
        for link_elem in overview_soup.find_all('a', href=True):
            text = link_elem.get_text(strip=True)
            if re.match(r'^\d+\.\s', text) or re.match(r'^Project\s\d+', text, re.IGNORECASE) or "discussion thread" in text.lower():
                # Try to remove the entire block containing the link
                # Look for a common parent that groups the number, title, and discussion link
                parent_to_remove = link_elem.find_parent(['li', 'div', 'p']) # Common parent types for list items/blocks
                if parent_to_remove:
                    parent_to_remove.decompose()
                else:
                    link_elem.decompose() # Fallback if no suitable parent found
        
        overview_text_content = clean_html_text(overview_soup, remove_links=True) 
        if overview_text_content.strip():
            save_content("course_overview.txt", overview_text_content)
            print("Extracted overview text.")
        else:
            print("Warning: Overview text extraction resulted in blank content after cleaning. Adjusting selectors needed.")
            save_content("course_overview_blank_debug.txt", "Blank content after processing (overview).")
    else:
        print("Warning: Could not find a suitable main content container for overview. Fallback to cleaning entire page.")
        save_content("course_overview_fallback.txt", clean_html_text(soup, remove_links=True)) 

    # --- 2. Identify Module and Project Links ---
    module_links_found = []
    
    # Target elements that contain both the module number/title and the link itself.
    # Based on your screenshots, these look like <p> tags or similar blocks.
    
    # Try to find all <a> tags that match the pattern, anywhere on the page
    # You might also look for parent elements of these links if they are consistently structured.
    # For example, if they are all inside a <div class="course-modules">
    # content_area = soup.find('div', class_='course-modules')
    # if content_area:
    #     potential_links = content_area.find_all('a', href=True)
    # else:
    #     potential_links = soup.find_all('a', href=True) # Fallback
    
    potential_links = soup.find_all('a', href=True) # Start broad if not sure about parent container
    
    for link in potential_links:
        href = link.get('href')
        text = link.get_text().strip()
        
        if not href or href.startswith(('#', 'javascript:')):
            continue
        
        if "discussion thread" in text.lower() or "search" in text.lower() or "login" in text.lower():
            continue
            
        if not href.startswith(('http://', 'https://')):
            abs_url = requests.compat.urljoin(BASE_URL, href)
        else:
            abs_url = href
        
        if re.match(r'^\d+\.\s', text) or re.match(r'^Project\s\d+', text, re.IGNORECASE):
            if abs_url.startswith(BASE_URL.split('/')[0] + '//' + BASE_URL.split('/')[2]):
                module_links_found.append({'url': abs_url, 'text': text})
            else:
                print(f"Skipping external link for module: {text} -> {abs_url}")
        
    unique_module_links = []
    processed_urls_for_modules = set()
    for link_info in module_links_found:
        if link_info['url'] not in processed_urls_for_modules:
            unique_module_links.append(link_info)
            processed_urls_for_modules.add(link_info['url'])

    print(f"Found {len(unique_module_links)} unique module/project content links.")

    # --- 3. Visit Each Module Page and Scrape Content ---
    for i, link_info in enumerate(unique_module_links):
        url = link_info['url']
        original_link_text = link_info['text']
        filename_prefix = normalize_filename(original_link_text)
        
        print(f"\nProcessing content from module: {original_link_text} ({url})")

        module_page_html = get_page_content_selenium(url)
        if module_page_html:
            module_soup = BeautifulSoup(module_page_html, 'html.parser')
            # For module pages, we generally *do not* want to remove links as they might be relevant content.
            cleaned_text = clean_html_text(module_soup, remove_links=False) 
            
            if cleaned_text.strip():
                filename = f"{filename_prefix}.txt"
                save_content(filename, cleaned_text)
            else:
                print(f"Warning: No significant text extracted from {url} after cleaning. Saved blank debug file.")
                save_content(f"{filename_prefix}_blank_debug.txt", "Blank content after processing (module).")
        else:
            print(f"Failed to retrieve content for module: {original_link_text} ({url})")

    # --- Cleanup ---
    if driver:
        driver.quit()
        print("WebDriver closed.")
    print("\nCourse content scraping finished.")

if __name__ == "__main__":
    os.makedirs(COURSE_CONTENT_OUTPUT_DIR, exist_ok=True)
    
    print("--- Starting IIT Madras TDS Course Content Scraping (Selenium - Microsoft Edge) ---")
    print("WARNING: This script now uses Selenium with MSEdgeDriver. Ensure it's installed.")
    print(f"Ensure WEBDRIVER_PATH is correct: '{WEBDRIVER_PATH}'")
    print("If browser window does not appear or shows errors, ensure MSEdgeDriver version matches your Edge browser version exactly.")
    print("Remember to handle authentication if the content is behind a login.")
    print("Ensure you have permission to scrape the website.")
    
    scrape_tds_course_content()
    print("--- Scraping script execution complete ---")

--- Starting IIT Madras TDS Course Content Scraping (Selenium - Microsoft Edge) ---
Ensure WEBDRIVER_PATH is correct: 'C:\Users\sahil\Desktop\TA_Project\msedgedriver.exe'
If browser window does not appear or shows errors, ensure MSEdgeDriver version matches your Edge browser version exactly.
Remember to handle authentication if the content is behind a login.
Ensure you have permission to scrape the website.
Starting course content scraping from https://tds.s-anand.net/#/README?id=tools-in-data-science-may-2025
WebDriver (Microsoft Edge) initialized successfully.
Attempting to load page with Selenium: https://tds.s-anand.net/#/README?id=tools-in-data-science-may-2025
Error loading page with Selenium https://tds.s-anand.net/#/README?id=tools-in-data-science-may-2025: Message: 
Stacktrace:
	GetHandleVerifier [0x0x7ff7ce293865+25605]
	(No symbol) [0x0x7ff7ce1e3970]
	Microsoft::Applications::Events::EventProperty::to_string [0x0x7ff7ce516e7a+1962506]
	(No symbol) [0x0x7ff7cdff86c4]
	(No sym

In [8]:
driver.get("https://www.google.com")

In [9]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Ensure this path is correct for your msedgedriver.exe
WEBDRIVER_PATH = r'C:\Users\sahil\Desktop\TA_Project\msedgedriver.exe'

service = Service(WEBDRIVER_PATH)
driver = webdriver.Edge(service=service)

# Set an implicit wait time (e.g., 10 seconds)
driver.implicitly_wait(10) # Wait up to 10 seconds for elements to appear

try:
    url = "https://tds.s-anand.net/#/2025-01/"
    print(f"Attempting to load page: {url}")
    driver.get(url)
    print("Page loaded successfully (or so it seems).")

    # Add a short sleep just to visually confirm the page is open
    time.sleep(5)

    # Maximize the window (can sometimes help with dynamic content rendering)
    driver.maximize_window()

    # --- IMPORTANT: Add an Explicit Wait for a specific element ---
    # Look at your screenshot. What's a unique element that appears after the page is fully loaded?
    # For example, the "Tools in Data Science - Jan 2025" heading.
    # Let's try to wait for that h1 element.

    print("Waiting for the main heading to be present...")
    main_heading_xpath = "//h1[contains(text(), 'Tools in Data Science - Jan 2025')]"
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, main_heading_xpath))
    )
    print("Main heading found! Page content seems to be fully loaded.")

    # Now, you can try to interact with other elements, e.g., click a link
    # Example: Try to click the "Development Tools" link in the left sidebar
    # Inspect the element on the actual page to get its correct XPath or CSS selector
    try:
        dev_tools_link_xpath = "//a[contains(@href, '#/2025-01/development-tools')]"
        dev_tools_element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, dev_tools_link_xpath))
        )
        print("Clicking 'Development Tools' link...")
        dev_tools_element.click()
        print("Clicked 'Development Tools'. Waiting for content to change...")
        time.sleep(5) # Give it time to load the new content

        # You might want to assert that the URL changed or new content is present
        print(f"Current URL after click: {driver.current_url}")

    except Exception as e:
        print(f"Could not click 'Development Tools' link: {e}")

except Exception as e:
    print(f"Error loading page or finding elements: {e}")
    # Save screenshot if an error occurs during the main process
    driver.save_screenshot("error_after_waits.png")

finally:
    print("Closing browser...")
    driver.quit()

Attempting to load page: https://tds.s-anand.net/#/2025-01/
Page loaded successfully (or so it seems).
Waiting for the main heading to be present...
Error loading page or finding elements: Message: 
Stacktrace:
	GetHandleVerifier [0x0x7ff7ce293865+25605]
	(No symbol) [0x0x7ff7ce1e3970]
	Microsoft::Applications::Events::EventProperty::to_string [0x0x7ff7ce516e7a+1962506]
	(No symbol) [0x0x7ff7cdff86c4]
	(No symbol) [0x0x7ff7cdff898b]
	(No symbol) [0x0x7ff7ce039af7]
	(No symbol) [0x0x7ff7ce019e8f]
	(No symbol) [0x0x7ff7cdfeea5d]
	(No symbol) [0x0x7ff7ce0375df]
	(No symbol) [0x0x7ff7ce019bb3]
	(No symbol) [0x0x7ff7cdfedf56]
	(No symbol) [0x0x7ff7cdfed463]
	(No symbol) [0x0x7ff7cdfedd83]
	(No symbol) [0x0x7ff7ce0ee10d]
	(No symbol) [0x0x7ff7ce0fdce8]
	Microsoft::Applications::Events::EventProperty::to_string [0x0x7ff7ce378839+265161]
	Microsoft::Applications::Events::EventProperty::to_string [0x0x7ff7ce380111+296097]
	(No symbol) [0x0x7ff7ce1f2111]
	(No symbol) [0x0x7ff7ce1ea5b4]
	(No symb

In [11]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os

WEBDRIVER_PATH = r'C:\Users\sahil\Desktop\TA_Project\msedgedriver.exe' # Use raw string for path
OUTPUT_DIR = 'scraped_tds_course_content'

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

print("--- Starting IIT Madras TDS Course Content Scraping (Selenium - Microsoft Edge) ---")
print("WARNING: This script now uses Selenium with MSEdgeDriver. Ensure it's installed.")
print(f"Ensure WEBDRIVER_PATH is correct: '{WEBDRIVER_PATH}'")
print("If browser window does not appear or shows errors, ensure MSEdgeDriver version matches your Edge browser version exactly.")
print("Remember to handle authentication if the content is behind a login.")
print("Ensure you have permission to scrape the website.")

options = Options()
options.use_chromium = True
# Crucial: Set page load strategy to 'eager'
options.page_load_strategy = 'eager'
# options.add_argument("--headless") # Uncomment for headless mode after testing

driver = None
try:
    service = Service(WEBDRIVER_PATH)
    driver = webdriver.Edge(service=service, options=options)
    print("WebDriver (Microsoft Edge) initialized successfully.")

    url = "https://tds.s-anand.net/#/2025-01/"
    print(f"Attempting to load page: {url}")
    driver.get(url)
    print("Page loaded successfully (or so it seems).")

    # Give it a small explicit pause before trying to wait for elements
    time.sleep(2) # Added a small delay to let JavaScript execute

    print("Waiting for the main heading to be present...")
    # Use a more robust selector for the main heading.
    # Based on the screenshot, it looks like an h1 or h2.
    # You might need to inspect the element in your browser's dev tools (F12)
    # to get the exact tag name, class, or id.

    # Example using CSS_SELECTOR (adjust as needed after inspecting the element)
    main_heading_selector = "h1.q-page__title" # This is a common pattern for Quasar framework titles
    # If it's just an h1 or h2 without a specific class, use:
    # main_heading_selector = "h1" 
    # Or if it has an ID:
    # main_heading_selector = "#your_heading_id"

    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, main_heading_selector))
    )
    print("Main heading found!")

    # Now that the main heading is confirmed, you can proceed with other interactions
    # For example, finding all links in the main content area
    # This part of your script would go here:
    # --------------------------------------------------------------------------
    # Find all link elements. You'll need to refine this based on the actual HTML structure
    # links = driver.find_elements(By.TAG_NAME, 'a')
    # for link in links:
    #     href = link.get_attribute('href')
    #     text = link.text
    #     if href and "Discussion Thread" not in text: # Filter out discussion threads if needed
    #         print(f"Found link: {text} -> {href}")
    # --------------------------------------------------------------------------

except Exception as e:
    print(f"Error loading page or finding elements: {e}")
    error_screenshot_path = os.path.join(OUTPUT_DIR, "error_screenshot_after_waits.png")
    driver.save_screenshot(error_screenshot_path)
    print(f"Saved error screenshot to {error_screenshot_path}")
finally:
    if driver:
        driver.quit()
        print("Closing browser...")
    print("--- Scraping script execution complete ---")

--- Starting IIT Madras TDS Course Content Scraping (Selenium - Microsoft Edge) ---
Ensure WEBDRIVER_PATH is correct: 'C:\Users\sahil\Desktop\TA_Project\msedgedriver.exe'
If browser window does not appear or shows errors, ensure MSEdgeDriver version matches your Edge browser version exactly.
Remember to handle authentication if the content is behind a login.
Ensure you have permission to scrape the website.
WebDriver (Microsoft Edge) initialized successfully.
Attempting to load page: https://tds.s-anand.net/#/2025-01/
Page loaded successfully (or so it seems).
Waiting for the main heading to be present...
Error loading page or finding elements: Message: 
Stacktrace:
	GetHandleVerifier [0x0x7ff7ce293865+25605]
	(No symbol) [0x0x7ff7ce1e3970]
	Microsoft::Applications::Events::EventProperty::to_string [0x0x7ff7ce516e7a+1962506]
	(No symbol) [0x0x7ff7cdff86c4]
	(No symbol) [0x0x7ff7cdff898b]
	(No symbol) [0x0x7ff7ce039af7]
	(No symbol) [0x0x7ff7ce019e8f]
	(No symbol) [0x0x7ff7cdfeea5d]
	(N