In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import pandas as pd
from html import escape
import ast 
import itertools

In [2]:
def lecture_dataset(base_url):
    """
    Scrape the course homepage and build a table of lectures.

    Parameters
    ----------
    base_url : str
        URL of the page to scrape. It is also prepended (by plain string
        concatenation) to each lecture-notes href found on the page.

    Returns
    -------
    pandas.DataFrame
        Columns "Lecture Number", "Lecture Name", "Content Keywords" and
        "Lecture Notes URL", one row per retained lecture entry.

    Raises
    ------
    requests.HTTPError
        If the page request comes back with a 4xx/5xx status.
    ValueError
        Raised by pandas when the four collected lists end up with
        different lengths.
    """
    response = requests.get(base_url)
    # Check that the request was successful before parsing, so an error page
    # is never scraped as if it were the course homepage.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Keep the <p> of every <dd class="module-event main"> that carries a
    # <strong class="label label-lecture"> marker.
    main_events = [
        dd.find("p") for dd in soup.find_all("dd", class_="module-event main")
        if dd.find("strong", class_="label label-lecture") is not None
    ]

    # Drop entries whose <p> is immediately followed by a <dt> or <dd>.
    # NOTE(review): this removes items from the list while iterating over it,
    # so the element right after each removed one is never inspected. The
    # hard-coded indices below (pop(23), insert(13, ...)) were presumably
    # tuned against that behaviour -- confirm before changing either.
    for main in main_events:
        if main.next_sibling.name in ("dt", "dd"):
            main_events.remove(main)
    main_events.pop(23)

    additional = soup.find_all("span")

    # Column accumulators
    lecture_numbers = []
    lecture_names = []
    content_keywords = []
    lecture_notes_urls = []

    for main in main_events:
        # Lecture number: label text with the "LEC" prefix removed
        lecture_number = main.find("strong", class_="label label-lecture").text.replace("LEC", "").strip()
        lecture_numbers.append(lecture_number)

        # Lecture name: text of the first link in the entry
        lecture_name = main.find("a").text.strip()
        lecture_names.append(lecture_name)

        # Lecture notes URL: href of the second link, appended to base_url
        notes_path = main.find_all("a")[1]["href"]
        full_notes_url = base_url + notes_path
        lecture_notes_urls.append(full_notes_url)

    # Every <span> on the page is treated as a keywords entry.
    for add in additional:
        keywords_text = add.text.replace("Keywords:", "").strip()
        content_keywords.append(keywords_text)

    # Pad with "None" placeholders at fixed positions; presumably these are
    # the lectures that have no keywords span -- verify against the page.
    content_keywords.insert(13, "None")
    content_keywords.append("None")
    content_keywords.append("None")

    data = {
        "Lecture Number": lecture_numbers,
        "Lecture Name": lecture_names,
        "Content Keywords": content_keywords,
        "Lecture Notes URL": lecture_notes_urls
    }
    return pd.DataFrame(data)

    # Save to a CSV file
#     output_file = "lecture_notes_dataset.csv"
#     df.to_csv(output_file, index=False)
#     print(f"Dataset successfully saved to {output_file}!")


In [3]:
# Helper function to process table elements
def process_table_to_string(table):
    """
    Flatten an HTML table element into a list of rows.

    Each <tr> found in `table` becomes one inner list holding the
    whitespace-stripped text of its <th>/<td> cells, in document order.
    """
    cell_tags = ["th", "td"]
    return [
        [cell.get_text(strip=True) for cell in table_row.find_all(cell_tags)]
        for table_row in table.find_all("tr")
    ]

# Single lecture URL
#lecture_url = "https://dsc-courses.github.io/dsc10-2024-sp/resources/lectures/lec01/lec01.html"

def extract_lecture_content(lecture_url):
    """
    Fetch one lecture page and flatten its <body> into ordered rows.

    Walks every descendant of <body> in document order and records:
      * h1/h2/h3/h4/p/li -> the element text with "¶" removed and stripped
      * img with a src   -> the absolute image URL
      * div containing a <pre> and carrying the class
        'jp-Cell-inputWrapper' / 'jp-Cell-outputWrapper'
                         -> the <pre> text, tagged "pre input" / "pre output"
      * table            -> list of rows from process_table_to_string

    Rows that directly follow an <li> whose text contains line breaks are
    dropped (one row per line break); presumably those are the nested <li>
    elements already contained in the parent's text -- TODO confirm.

    Parameters
    ----------
    lecture_url : str
        URL of the rendered lecture page.

    Returns
    -------
    pandas.DataFrame or None
        Columns "tag" and "content"; None when the response status is not 200.
    """
    response = requests.get(lecture_url)

    if response.status_code != 200:
        # Make the failure visible; callers still receive None as before.
        print(f"Failed to fetch the page: {lecture_url}, status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # The whole <body> is used as the content container.
    content_container = soup.body
    descendants = content_container.descendants if content_container is not None else []

    # List of {"tag", "content"} dicts in document order
    ordered_elements = []

    for child in descendants:
        if child.name not in ["h1", "h2", "h3", "h4", "img", "p", "li", "div", "table"]:
            continue

        if child.name == "img" and child.has_attr("src"):
            # Store images as absolute URLs
            src = child["src"]
            img_url = src if src.startswith("http") else requests.compat.urljoin(lecture_url, src)
            ordered_elements.append({"tag": "img", "content": img_url})
        elif child.name == "div":
            pre = child.find("pre")
            if pre is None:
                continue
            # A <div> without a class attribute must not raise KeyError.
            classes = child.get("class") or []
            if 'jp-Cell-inputWrapper' in classes:
                ordered_elements.append({"tag": "pre input", "content": pre.text.strip()})
            elif 'jp-Cell-outputWrapper' in classes:
                ordered_elements.append({"tag": "pre output", "content": pre.text.strip()})
        elif child.name == "table":
            table_content = process_table_to_string(child)
            ordered_elements.append({"tag": "table", "content": table_content})
        else:
            # Clean text content: remove ¶ and strip whitespace
            clean_text = child.text.replace("¶", "").strip()
            ordered_elements.append({"tag": child.name, "content": clean_text})

    # Naming the columns keeps an element-less page from failing on a
    # missing "content" column.
    df = pd.DataFrame(ordered_elements, columns=["tag", "content"])
    df['linebreaks'] = df['content'].apply(num_linebreaks)

    duplicates = []
    for index, (tag, linebreaks) in enumerate(zip(df["tag"], df["linebreaks"])):
        if tag == "li" and linebreaks > 0:
            duplicates.extend(range(index + 1, index + 1 + int(linebreaks)))

    # errors="ignore": the computed indices may run past the last row.
    return df.drop(duplicates, errors="ignore")[['tag', 'content']]

#         # Save the data to a CSV file
#         output_file = "lec01_ordered_contents.csv"
#         df.to_csv(output_file, index=False)

#         print(f"Lecture contents successfully saved to {output_file}!")
#     else:
#         print(f"Failed to fetch the page: {lecture_url}, status code: {response.status_code}")


In [4]:
def num_linebreaks(content):
    """Return the number of newline occurrences reported by `content.count`."""
    newline = '\n'
    total = content.count(newline)
    return total

In [5]:

# Helper function to process tables
def process_table_to_HTML(content):
    """
    Render table data as an HTML <table> string.

    Parameters
    ----------
    content : str or list
        Either the string representation of a list of rows (each row a list
        of cells), or that list itself. The first row is rendered with <th>
        cells, every later row with <td> cells.

    Returns
    -------
    str
        The HTML table, or "<p>Error rendering table</p>\\n" when the input
        cannot be parsed or iterated (the error is printed).
    """
    try:
        # literal_eval only parses Python literals; it never executes code.
        table_data = ast.literal_eval(content) if isinstance(content, str) else content
        parts = ["<table>\n"]
        for i, row in enumerate(table_data):
            tag = "th" if i == 0 else "td"  # header cells for the first row only
            parts.append("<tr>\n")
            for cell in row:
                # str() so a numeric or None cell is rendered instead of
                # making escape() fail and discarding the whole table.
                parts.append(f"<{tag}>{escape(str(cell))}</{tag}>\n")
            parts.append("</tr>\n")
        parts.append("</table>\n")
        return "".join(parts)
    except Exception as e:
        # Best effort: one unparsable table must not abort the whole page.
        print(f"Error processing table: {e}")
        return "<p>Error rendering table</p>\n"

# Helper function to process list items
def process_list_items(content):
    """
    Render one list item as a standalone <ul> block.

    The text is split on line breaks. The first line becomes the top-level
    bullet; any remaining lines become bullets of a nested <ul> inside it.
    Every line is stripped and HTML-escaped.
    """
    head, *children = content.split("\n")
    top_item = escape(head.strip())

    # Single line: a plain one-bullet list
    if not children:
        return f"<ul>\n<li>{top_item}</li></ul>\n"

    # Several lines: nest the rest under the first bullet
    nested_items = "".join(
        f"<li>{escape(child.strip())}</li>\n" for child in children
    )
    return f"<ul>\n<li>{top_item}\n<ul>\n{nested_items}</ul>\n</li>\n</ul>\n"

def generate_html(input_csv, output_html, title="Lecture 1 Content"):
    """
    Build a styled HTML page from a CSV of ordered lecture elements.

    The CSV must have "tag" and "content" columns. Each h1/h2/h3 row opens a
    new <div class="section"> (closing the previous one); img, "pre input",
    "pre output", li and table rows get dedicated markup; any other tag is
    emitted as <tag>content</tag> with "¶" removed.

    Parameters
    ----------
    input_csv : str
        Path of the CSV file to read.
    output_html : str
        File name of the page; it is written to "html_files/<output_html>".
    title : str, optional
        Text of the page <title>. Defaults to "Lecture 1 Content".

    Returns
    -------
    None
        The page is written to disk and a confirmation line is printed.
    """
    import os

    # dtype=str / keep_default_na=False: an empty cell, or text such as "NA"
    # or "null", must stay a string. With the pandas defaults it becomes a
    # float NaN, which breaks escape() and .replace() below.
    df = pd.read_csv(input_csv, dtype=str, keep_default_na=False)

    # Page head with the CSS; __PAGE_TITLE__ is substituted below.
    page_head = """<!DOCTYPE html>
    <html>
    <head>
    <meta charset="utf-8">
    <title>__PAGE_TITLE__</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            line-height: 1.6;
        }
        img {
            max-width: 70%;
            height: auto;
            display: block;
            margin: 10px 0;
        }
        ul, ol {
            padding-left: 20px;
            margin-bottom: 1em;
        }
        ul ul, ol ol {
            padding-left: 20px;
        }
        li {
            margin-bottom: 0.5em;
        }
        .section {
            margin-bottom: 20px;
            padding: 10px;
            border: 1px solid #ddd;
            border-radius: 5px;
        }
        /* Styling for Jupyter Notebook-style code cells */
        pre.code {
            background-color: #f7f7f7;
            border: 1px solid #ddd;
            border-radius: 5px;
            padding: 10px;
            margin: 10px 0;
            font-family: "Courier New", Courier, monospace;
            font-size: 10px;
            color: #333;
            white-space: pre-wrap;
        }
        /* Styling for Jupyter Notebook-style output cells */
        pre.output {
            font-size: 10px;
            color: #555;
            white-space: pre-wrap;
        }
        /* Styling for tables */
        table {
            width: 50%;
            border-collapse: collapse;
            margin: 20px 0;
        }
        th, td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }
        th {
            background-color: #f2f2f2;
        }
    </style>
    </head>
    <body>
    """

    # Collect fragments and join once instead of repeated string concatenation.
    parts = [page_head.replace("__PAGE_TITLE__", escape(title))]

    # Track whether a section <div> is currently open
    section_open = False

    for tag, content in zip(df["tag"], df["content"]):
        if tag in {"h1", "h2", "h3"}:
            # A heading closes the previous section and opens a new one
            if section_open:
                parts.append("</div>\n")
            parts.append(f'<div class="section">\n<{tag}>{escape(content)}</{tag}>\n')
            section_open = True
        elif tag == "img":
            parts.append(f'<img src="{escape(content)}" alt="Image">\n')
        elif tag == "pre input":
            parts.append(f"<pre class='code'>{escape(content)}</pre>\n")
        elif tag == "pre output":
            parts.append(f"<pre class='output'>{escape(content)}</pre>\n")
        elif tag == "li":
            parts.append(process_list_items(content))
        elif tag == "table":
            parts.append(process_table_to_HTML(content))
        else:
            clean_content = content.replace("¶", "").strip()
            parts.append(f"<{tag}>{escape(clean_content)}</{tag}>\n")

    # Close the last open section, if any
    if section_open:
        parts.append("</div>\n")

    parts.append("</body>\n</html>")
    html_content = "".join(parts)

    filepath = f"html_files/{output_html}"

    # Create the output directory on first use instead of failing in open().
    os.makedirs("html_files", exist_ok=True)

    with open(filepath, "w", encoding="utf-8") as file:
        file.write(html_content)

    print(f"HTML file successfully created at {filepath}!")


In [6]:
def lecture_content_for_question_generation(input_html, output_csv):
    """
    Turn a generated lecture HTML page into a per-section CSV dataset.

    The lecture number and title are taken from the first <h1>, split on the
    first en dash ("–"); without a dash the whole heading is the title. Each
    <div class="section"> with non-empty text yields one row.

    Parameters
    ----------
    input_html : str
        Path of the HTML file to read.
    output_csv : str
        Path of the CSV file to write. It is used as given; no directory
        prefix is added.

    Returns
    -------
    None
        The CSV (columns lecture_number, lecture_title, section_name,
        section) is written to disk and a confirmation line is printed.
    """
    import os

    with open(input_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # Lecture number and title come from the first <h1>
    h1_tag = soup.find("h1")
    lecture_number = ""
    lecture_title = ""
    if h1_tag:
        lecture_header = h1_tag.text.strip()
        if "–" in lecture_header:
            # Split on the first dash only, so a title that itself contains
            # an en dash is kept whole.
            number_part, title_part = lecture_header.split("–", 1)
            lecture_number = number_part.replace("Lecture ", "").strip()
            lecture_title = title_part.strip()
        else:
            lecture_title = lecture_header

    sections = []
    for section in soup.find_all("div", class_="section"):
        # Section name comes from the first h2/h3 (never the h1)
        section_name_tag = section.find(["h2", "h3"])
        section_name = section_name_tag.text.strip() if section_name_tag else ""

        # NOTE(review): find_all() visits every descendant, so text inside
        # nested elements (table cells, nested list items) is appended once
        # per enclosing tag. Confirm whether that duplication is intended.
        section_text = []
        for tag in section.find_all():
            if tag.name == "pre" and "class" in tag.attrs:
                # Label code cells and their outputs
                if "code" in tag["class"]:
                    section_text.append(f"Input: {tag.text.strip()}")
                elif "output" in tag["class"]:
                    section_text.append(f"Output: {tag.text.strip()}")
            elif tag not in {section_name_tag, h1_tag} and tag.name not in {"img", "ul"}:
                # Skip the headings, images and <ul> wrappers; keep other text
                section_text.append(tag.text.strip())

        combined_text = " ".join(section_text).strip()

        # Sections without any text are left out of the dataset
        if combined_text:
            sections.append({
                "lecture_number": lecture_number,
                "lecture_title": lecture_title,
                "section_name": section_name,
                "section": combined_text
            })

    # Fixed column list so the CSV keeps its header even with no sections.
    df = pd.DataFrame(
        sections,
        columns=["lecture_number", "lecture_title", "section_name", "section"],
    )

    # Create the target directory if the given path has one and it is missing.
    parent_dir = os.path.dirname(output_csv)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(output_csv, index=False, encoding="utf-8")

    print(f"Dataset successfully created and saved to {output_csv}!")


In [7]:
# Step 1. Create a database of all lecture topics
# URL of the website to scrape; lecture_dataset also prepends it to every
# lecture-notes link it finds on the page.
base_url = "https://dsc-courses.github.io/dsc10-2024-sp/"
# One row per lecture; the 'Lecture Notes URL' column drives the next step.
lectures = lecture_dataset(base_url)

In [13]:
# Step 2. Scrape each individual lecture for key content
# Lecture ids that this step deliberately skips.
skipped_lectures = {'lec11', 'lec14', 'lec27'}

lecture_content_output_csv = []
for lecture_url in lectures['Lecture Notes URL']:
    # The lecture id is the directory component of the notes URL, e.g. "lec01"
    lecture_id = lecture_url.split('/')[-2]
    if lecture_id in skipped_lectures:
        continue

    lecture_content = extract_lecture_content(lecture_url)
    if lecture_content is None:
        # extract_lecture_content returns None when the page is not served
        # with status 200; skip it instead of failing on .to_csv below.
        print(f"Skipping {lecture_id}: no content returned for {lecture_url}")
        continue

    output_file = f"{lecture_id}_ordered_contents.csv"
    filepath = f"web scraping/{output_file}"
    lecture_content.to_csv(filepath, index=False)
    print(f"Lecture contents successfully saved to {filepath}!")
    lecture_content_output_csv.append(filepath)

Lecture contents successfully saved to web scraping/lec01_ordered_contents.csv!
Lecture contents successfully saved to web scraping/lec02_ordered_contents.csv!
Lecture contents successfully saved to web scraping/lec03_ordered_contents.csv!
Lecture contents successfully saved to web scraping/lec04_ordered_contents.csv!
Lecture contents successfully saved to web scraping/lec05_ordered_contents.csv!
Lecture contents successfully saved to web scraping/lec06_ordered_contents.csv!
Lecture contents successfully saved to web scraping/lec07_ordered_contents.csv!
Lecture contents successfully saved to web scraping/lec08_ordered_contents.csv!
Lecture contents successfully saved to web scraping/lec09_ordered_contents.csv!
Lecture contents successfully saved to web scraping/lec10_ordered_contents.csv!
Lecture contents successfully saved to web scraping/lec12_ordered_contents.csv!
Lecture contents successfully saved to web scraping/lec13_ordered_contents.csv!
Lecture contents successfully saved to w

In [19]:
# Step 3. Generate HTML code from key lecture content, grouped by h3 section
# Scraped CSVs that this step deliberately skips.
skipped_content_csvs = {
    "web scraping/lec02_ordered_contents.csv",
    "web scraping/lec05_ordered_contents.csv",
    "web scraping/lec09_ordered_contents.csv",
    "web scraping/lec10_ordered_contents.csv",
    "web scraping/lec13_ordered_contents.csv",
    "web scraping/lec21_ordered_contents.csv",
}

html_files = []
# The loop variable is not called `csv`, which would shadow the stdlib module name.
for contents_csv in lecture_content_output_csv:
    if contents_csv in skipped_content_csvs:
        continue
    # "web scraping/lec01_ordered_contents.csv" -> "lec01_contents.html"
    output_file = f"{contents_csv.split('/')[-1].split('_')[0]}_contents.html"
    generate_html(contents_csv, output_file)
    # generate_html writes into html_files/; record the path it produced
    filepath = f"html_files/{output_file}"
    html_files.append(filepath)

HTML file successfully created at html_files/lec01_contents.html!
HTML file successfully created at html_files/lec03_contents.html!
HTML file successfully created at html_files/lec04_contents.html!
HTML file successfully created at html_files/lec06_contents.html!
HTML file successfully created at html_files/lec07_contents.html!
HTML file successfully created at html_files/lec08_contents.html!
HTML file successfully created at html_files/lec12_contents.html!
HTML file successfully created at html_files/lec15_contents.html!
HTML file successfully created at html_files/lec16_contents.html!
HTML file successfully created at html_files/lec17_contents.html!
HTML file successfully created at html_files/lec18_contents.html!
HTML file successfully created at html_files/lec19_contents.html!
HTML file successfully created at html_files/lec20_contents.html!
HTML file successfully created at html_files/lec22_contents.html!
HTML file successfully created at html_files/lec23_contents.html!
HTML file 

In [21]:
# Step 4. Create lecture notes for each lecture, grouped by topic
lecture_notes = []
# The loop variable is not called `html`, the name of the stdlib module
# that `escape` is imported from.
for html_path in html_files:
    # "html_files/lec01_contents.html" -> "lec01_dataset.csv"
    output_csv = f"{html_path.split('/')[-1].split('_')[0]}_dataset.csv"
    # The function writes to exactly the path it is given, so the
    # directory prefix is added here.
    filepath = f"lecture_notes/{output_csv}"
    lecture_content_for_question_generation(html_path, filepath)
    lecture_notes.append(filepath)

Dataset successfully created and saved to lecture_notes/lec01_dataset.csv!
Dataset successfully created and saved to lecture_notes/lec03_dataset.csv!
Dataset successfully created and saved to lecture_notes/lec04_dataset.csv!
Dataset successfully created and saved to lecture_notes/lec06_dataset.csv!
Dataset successfully created and saved to lecture_notes/lec07_dataset.csv!
Dataset successfully created and saved to lecture_notes/lec08_dataset.csv!
Dataset successfully created and saved to lecture_notes/lec12_dataset.csv!
Dataset successfully created and saved to lecture_notes/lec15_dataset.csv!
Dataset successfully created and saved to lecture_notes/lec16_dataset.csv!
Dataset successfully created and saved to lecture_notes/lec17_dataset.csv!
Dataset successfully created and saved to lecture_notes/lec18_dataset.csv!
Dataset successfully created and saved to lecture_notes/lec19_dataset.csv!
Dataset successfully created and saved to lecture_notes/lec20_dataset.csv!
Dataset successfully crea