In [29]:
# Imports: standard library first, then third-party packages
import logging 
import os
import json
import re
from datetime import datetime
from urllib.parse import urljoin
from markdownify import markdownify as md
from playwright.async_api import async_playwright
import asyncio


In [30]:
# URL the crawl starts from (passed to crawl_page by main()).
BASE_URL = "https://tds.s-anand.net/#/2025-01/"
# Site origin used to decide which extracted links count as internal.
BASE_ORIGIN = "https://tds.s-anand.net"
# Directory that receives one Markdown file per scraped page.
OUTPUT_DIR = "scraped_data_course_content"
# JSON file that main() writes the collected per-page metadata to.
METADATA_FILE = "metadata.json"

In [31]:
# Configure logging: write records at DEBUG level and above to logs/scraped_course_content.log.

logging_dir="logs"
os.makedirs(logging_dir, exist_ok=True)  # make sure the log directory exists before the file handler opens it
# NOTE(review): force=True is presumably here so re-running this cell re-applies the configuration — confirm against the logging.basicConfig docs.
logging.basicConfig(filename=os.path.join(logging_dir,"scraped_course_content.log"), level=logging.DEBUG, format='%(asctime)s %(message)s', force=True)

In [32]:

# URLs already handed to crawl_page; it checks this set so each URL is processed at most once.
visited = set()
# One dict per saved page (title, filename, original_url, downloaded_at); main() dumps it to METADATA_FILE.
metadata = []

In [33]:
# Function to format the file name
def format_filename(title):
    r"""Turn a page title into a file-name stem.

    Each of the characters \ / * ? : " < > | is replaced with an underscore,
    leading and trailing whitespace is then trimmed, and every remaining
    space becomes an underscore.

    Args:
        title: The page title to convert.

    Returns:
        The converted string (without any extension).
    """
    without_reserved = re.sub(r'[\\/*?:"<>|]', "_", title)
    trimmed = without_reserved.strip()
    return trimmed.replace(" ", "_")


In [34]:
# function to extract all internal links 
async def extract_internal_links(page):
    """Collect the distinct internal, hash-routed links present on ``page``.

    Args:
        page: Object exposing an awaitable
            ``eval_on_selector_all(selector, expression)`` (a Playwright page),
            already navigated to the document of interest.

    Returns:
        A sorted list of unique URLs that begin with ``BASE_ORIGIN`` and
        contain the ``/#/`` route marker.
    """
    hrefs = await page.eval_on_selector_all("a[href]", "els => els.map(el => el.href)")

    # Anchor the origin at the start of the URL: a plain substring test would
    # also accept external links that merely embed the origin (for example in
    # a query string) or hosts that only share it as a prefix.
    origin_prefix = BASE_ORIGIN.rstrip("/") + "/"

    internal = {
        href for href in hrefs
        # Skip anything that is not a string so one odd anchor cannot abort the crawl.
        if isinstance(href, str)
        and href.startswith(origin_prefix)
        and "/#/" in href
    }

    # Sorted so the crawl order is the same on every run (set order is not).
    return sorted(internal)

In [35]:
# Function to wait for the main article to render and return its inner HTML
async def wait_for_article_and_get_html(page):
    """Return the inner HTML of the main article once it is present on ``page``.

    Waits for ``article.markdown-section#main`` with ``timeout=10000`` and then
    reads that element's inner HTML. Anything raised by the page calls
    propagates to the caller.

    Args:
        page: Object exposing awaitable ``wait_for_selector`` and ``inner_html``.

    Returns:
        Whatever ``page.inner_html`` returns for the article selector.
    """
    article_selector = "article.markdown-section#main"
    await page.wait_for_selector(article_selector, timeout=10000)
    article_html = await page.inner_html(article_selector)
    return article_html


In [36]:
# Crawl a page: save its article as Markdown, then follow its internal links
async def crawl_page(page, url):
    """Scrape ``url`` and every internal page reachable from it.

    Each not-yet-visited page is loaded, its main article is converted to
    Markdown and written into ``OUTPUT_DIR`` behind a YAML front matter header,
    and a record is appended to ``metadata``. Internal links found on the page
    are then followed depth-first, in the order ``extract_internal_links``
    returns them.

    Args:
        page: Playwright page reused for every navigation.
        url: URL to start from; skipped if it is already in ``visited``.

    Returns:
        None. Results are exposed through the files written and through the
        module-level ``visited`` and ``metadata`` collections.
    """
    # Callers other than main() may not have created the directory yet.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Explicit stack instead of recursion: the traversal order is the same
    # depth-first order, but a long chain of pages can no longer exhaust the
    # interpreter's recursion limit.
    pending = [url]
    while pending:
        current = pending.pop()
        if current in visited:
            continue
        visited.add(current)

        logging.info(f"📄 Visiting: {current}")
        try:
            await page.goto(current, wait_until="domcontentloaded")
            await page.wait_for_timeout(1000)
            html = await wait_for_article_and_get_html(page)
        except Exception as e:
            # Best effort: log the failure and move on to the remaining pages.
            logging.error(f"❌ Error loading page: {current}\n{e}")
            continue

        # Extract title and save markdown
        title = (await page.title()).split(" - ")[0].strip() or f"page_{len(visited)}"
        filename = format_filename(title)
        # NOTE(review): the file name depends only on the title, so two pages
        # with the same title write to the same file (the later one wins).
        filepath = os.path.join(OUTPUT_DIR, f"{filename}.md")

        markdown = md(html)
        # Single timestamp so the front matter and the metadata record agree.
        downloaded_at = datetime.now().isoformat()

        with open(filepath, "w", encoding="utf-8") as f:
            f.write("---\n")
            # json.dumps yields a double-quoted scalar with quotes, backslashes
            # and control characters escaped, so such titles no longer break
            # the front matter.
            f.write(f"title: {json.dumps(title, ensure_ascii=False)}\n")
            f.write(f"original_url: {json.dumps(current, ensure_ascii=False)}\n")
            f.write(f"downloaded_at: \"{downloaded_at}\"\n")
            f.write("---\n\n")
            f.write(markdown)

        metadata.append({
            "title": title,
            "filename": f"{filename}.md",
            "original_url": current,
            "downloaded_at": downloaded_at
        })

        # Follow all links found on the page (not just main content). They are
        # pushed in reverse so the first link is popped, and crawled, first.
        links = await extract_internal_links(page)
        pending.extend(link for link in reversed(links) if link not in visited)


In [37]:
# main method 
async def main():
    """Crawl the site starting at ``BASE_URL`` and write ``METADATA_FILE``.

    Creates ``OUTPUT_DIR``, launches a headless Chromium browser, runs
    ``crawl_page`` from ``BASE_URL`` and dumps the collected ``metadata`` list
    as JSON. The metadata file is written and the browser is closed even when
    the crawl raises; the exception then propagates to the caller.

    Note:
        ``visited`` and ``metadata`` are module-level and are not reset here,
        so calling ``main()`` again in the same kernel skips URLs that were
        already visited.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()
            try:
                await crawl_page(page, BASE_URL)
            finally:
                # Persist whatever was collected, so a failure part-way through
                # the crawl does not discard the record of pages already saved.
                with open(METADATA_FILE, "w", encoding="utf-8") as f:
                    json.dump(metadata, f, indent=2)

            logging.info(f"\n✅ Completed. {len(metadata)} pages saved.")
        finally:
            # Always release the browser, including on errors.
            await browser.close()

# Run the crawl when this cell/script is executed as the main module.
if __name__ == "__main__":
    # NOTE(review): nest_asyncio is imported and applied before asyncio.run(),
    # presumably so the call works while the notebook's own event loop is
    # already running — confirm against the nest_asyncio documentation.
    import nest_asyncio
    nest_asyncio.apply()

    asyncio.run(main())