In [1]:
import re

import markdownify
from bs4 import BeautifulSoup


def extract_text_from_html(file_path):
    """Convert an HTML file to Markdown text.

    The file is read as UTF-8 and parsed with BeautifulSoup's "lxml" parser.
    All ``<style>`` and ``<script>`` elements are removed, the remaining
    markup is flattened onto a single line, and the result is passed to
    ``markdownify`` with ATX-style (``#``) headings.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        The Markdown string produced by ``markdownify.markdownify``.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file.read(), "lxml")

    # Strip these tags out: their contents are CSS/JS, not syllabus text.
    for tag in soup.find_all(["style", "script"]):
        tag.decompose()

    # Use a single line of html text for the markdown parser.
    html_content = str(soup)
    # Line breaks that sit directly between two tags carry no text, so drop them.
    html_content = re.sub(r">\n+<", "><", html_content)
    # A line break squeezed between two non-whitespace characters is the only
    # thing separating them (e.g. "foo\nbar"); turn it into a space so the
    # words are not glued together when the line breaks are removed.
    html_content = re.sub(r"(?<=\S)\n+(?=\S)", " ", html_content)
    # Every remaining line break already has whitespace next to it.
    html_content = html_content.replace("\n", "")

    return markdownify.markdownify(html_content, heading_style="ATX")


# Convert one HTML syllabus on its own as a quick check of
# extract_text_from_html. The identifier is named once so the input and
# output paths cannot drift apart.
sample_course = "2021-12_532"
text = extract_text_from_html(f"./raw_syllabi/{sample_course}.html")

with open(f"./parsed_syllabi/{sample_course}.md", "w", encoding="utf-8") as file:
    file.write(text)

In [2]:
import fitz  # PyMuPDF


def extract_text_from_pdf(file_path):
    """Return the text of a PDF as one string.

    The document is opened with PyMuPDF (``fitz``) and ``get_text()`` is
    called on each page in iteration order; the per-page results are
    concatenated with no separator added between pages.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages ("" for a document with no pages).
    """
    with fitz.open(file_path) as pdf:
        # Collect the pages while the document is still open.
        page_texts = [page.get_text() for page in pdf]

    return "".join(page_texts)

In [3]:
# One (date, course number) pair per syllabus; the date strings are written as
# "YYYY-MM". The trailing comments note where each syllabus was found or how
# the file was obtained.
course_records = [
    ("2021-09", "505"),
    ("2021-10", "502"),
    ("2021-10", "515"),
    ("2021-11", "521"), # This one is listed under week 1 and doesn't have a reference in "resources"
    ("2021-12", "532"),
    ("2022-01", "501"), # pdf
    ("2022-01", "511"),
    ("2022-02", "522"),
    ("2022-03", "503"),
    ("2022-04", "523"),
    ("2022-05", "542"), # This one is listed under week 1 materials
    ("2022-06", "543"), # This one is listed under week 1 materials
    ("2022-06", "611"),
    ("2022-07", "516"),
    ("2022-08", "622"),
    ("2022-09", "593"),
    ("2022-11", "631"),
    ("2023-01", "652"), # pdf
    ("2023-02", "632"),
    ("2023-02", "673"), # This was just a link to a Google Doc, which I downloaded as a pdf
    ("2023-03", "642"),
    ("2023-03", "643"), # This was just a link to a Google Doc, which I downloaded as a pdf
    ("2023-04", "655"), # This one is listed under week 1 materials
    ("2023-05", "524"),
    ("2023-06", "601"),
    ("2023-06", "630"),
    ("2023-09", "696"), # This is a pdf in a Google Drive folder that I no longer have access to; luckily I kept a copy that I downloaded during the class
    ("2023-11", "682"),
    ("2023-12", "688"),
    ("2024-02", "699"), # This was just a link to a Google Doc, which I downloaded as a pdf
]

# File identifiers of the form "<date>_<num>", e.g. "2021-09_505".
courses = [f"{date}_{num}" for date, num in course_records]

In [4]:
import os


def extract_text(file_identifier, base_path="./raw_syllabi/"):
    """Extract the text of a syllabus stored as HTML or PDF.

    Looks for ``<base_path>/<file_identifier>.html`` first and, if that does
    not exist, for ``<base_path>/<file_identifier>.pdf``. The HTML version
    therefore takes precedence when both files are present.

    Args:
        file_identifier: File name without extension, e.g. "2021-09_505".
        base_path: Directory that holds the raw syllabus files.

    Returns:
        The extracted text. If neither file exists, no exception is raised;
        a "Course Syllabus does not exist at: ..." message is returned
        instead, so callers that write the result to disk will write that
        message.
    """
    # Build the extension-less path once with os.path.join so the not-found
    # message is correct even when base_path has no trailing separator.
    stem = os.path.join(base_path, file_identifier)
    html_path = f"{stem}.html"
    pdf_path = f"{stem}.pdf"

    if os.path.exists(html_path):
        return extract_text_from_html(html_path)
    if os.path.exists(pdf_path):
        return extract_text_from_pdf(pdf_path)
    return f"Course Syllabus does not exist at: {stem}"

In [5]:
# Convert every syllabus listed in `courses` and save it as a Markdown file.
PARSED_DIR = "./parsed_syllabi"
# open(..., "w") does not create missing directories, so create it up front.
os.makedirs(PARSED_DIR, exist_ok=True)

for course in courses:
    # NOTE: when no raw file exists, extract_text returns a "does not exist"
    # message rather than raising, and that message is what gets written.
    text = extract_text(course)

    with open(f"{PARSED_DIR}/{course}.md", "w", encoding="utf-8") as file:
        file.write(text)

In [6]:
# Run Prettier (through npx) on the parsed_syllabi directory; --write makes it
# rewrite the files in place instead of printing the formatted result.
!npx prettier ./parsed_syllabi/ --write

[K[?25h░░░░⠂⠂⠂⠂⠂⠂⸩ ⠼ build:prettier: sill doSerial build 8[0m[K/.npm/_locks[0m[K[0m[Knpx: installed 1 in 1.444s
parsed_syllabi/2021-09_505.md[2K[1Gparsed_syllabi/2021-09_505.md 94ms
parsed_syllabi/2021-09_505.txt[2K[1Gparsed_syllabi/2021-10_502.md[2K[1Gparsed_syllabi/2021-10_502.md 75ms
parsed_syllabi/2021-10_502.txt[2K[1Gparsed_syllabi/2021-10_515.md[2K[1Gparsed_syllabi/2021-10_515.md 58ms
parsed_syllabi/2021-10_515.txt[2K[1Gparsed_syllabi/2021-11_521.md[2K[1Gparsed_syllabi/2021-11_521.md 37ms
parsed_syllabi/2021-11_521.txt[2K[1Gparsed_syllabi/2021-12_532.md[2K[1Gparsed_syllabi/2021-12_532.md 48ms
parsed_syllabi/2021-12_532.txt[2K[1Gparsed_syllabi/2022-01_501.md[2K[1Gparsed_syllabi/2022-01_501.md 39ms
parsed_syllabi/2022-01_501.txt[2K[1Gparsed_syllabi/2022-01_511.md[2K[1Gparsed_syllabi/2022-01_511.md 41ms
parsed_syllabi/2022-01_511.txt[2K[1Gparsed_syllabi/2022-02_522.md[2K[1Gparsed_syllabi/2022-02_522.md 41ms
parsed_syllabi/2022-02_522.txt[2K[1Gp