In [2]:
from pathlib import Path

In [74]:
# Treat every sub-directory of the working directory as one course, leaving
# out entries that are repository/OS bookkeeping rather than courses.
root = Path()
children = list(root.iterdir())
course_dirs = [
    child
    for child in children
    if child.is_dir() and child.name not in (".git", ".DS_Store")
]
course_names = [course_dir.name for course_dir in course_dirs]

In [75]:
import json


def extract_data_from_json(file_path):
    """Read a UTF-8 JSON file and return its parsed contents.

    Args:
        file_path: Location of the JSON file to load.

    Returns:
        The decoded JSON value, or None when the file does not exist or does
        not contain valid JSON. In both failure cases a message is printed
        instead of raising.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            raw_text = handle.read()
            return json.loads(raw_text)
    except json.JSONDecodeError:
        print(f"Error decoding JSON from the file {file_path}.")
    except FileNotFoundError:
        print(f"The file {file_path} was not found.")
    return None

In [76]:
from bs4 import BeautifulSoup
import re


def extract_urls_from_html(file_path):
    """Collect absolute http(s) URLs referenced by tags in an HTML file.

    The ``href`` attribute is read from ``<a>``, ``<img>``, ``<link>`` and
    ``<script>`` tags, and the ``src`` attribute from ``<img>`` and
    ``<script>`` tags. Values that do not begin with an ``http://`` or
    ``https://`` URL (for example relative paths) are skipped.

    Args:
        file_path: Path of the UTF-8 encoded HTML file to read.

    Returns:
        A list of URL strings: the matching ``href`` values first, followed
        by the matching ``src`` values.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, "html.parser")

    # Pair each tag with the attribute it was selected for. Scripts and
    # images carry their URL in ``src``; selecting them only by ``href``
    # would never pick up a <script src="..."> reference.
    candidates = [
        (tag, "href")
        for tag in soup.find_all(["a", "img", "link", "script"], href=True)
    ] + [(tag, "src") for tag in soup.find_all(["img", "script"], src=True)]

    # Only values that start with an http(s) URL are kept (match() anchors
    # at the beginning of the string).
    url_pattern = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )

    urls = []
    for tag, attribute in candidates:
        url = tag.get(attribute)
        if url and url_pattern.match(url):
            urls.append(url)

    return urls


def extract_a_tag_contents(file_path):
    """Return the visible text of every ``<a>`` tag in an HTML file.

    Args:
        file_path: Path of the UTF-8 encoded HTML file to read.

    Returns:
        A list with one string per ``<a>`` tag, each stripped of leading and
        trailing whitespace.
    """
    with open(file_path, "r", encoding="utf-8") as html_file:
        markup = html_file.read()

    document = BeautifulSoup(markup, "html.parser")

    link_texts = []
    for anchor in document.find_all("a"):
        link_texts.append(anchor.get_text().strip())
    return link_texts


def extract_a_tag_hrefs(file_path):
    """Return the ``href`` value of every ``<a>`` tag in an HTML file.

    Args:
        file_path: Path of the UTF-8 encoded HTML file to read.

    Returns:
        A list of ``href`` values. Anchors whose ``href`` is missing or
        empty are left out.
    """
    with open(file_path, "r", encoding="utf-8") as html_file:
        markup = html_file.read()

    document = BeautifulSoup(markup, "html.parser")

    link_targets = []
    for anchor in document.find_all("a"):
        target = anchor.get("href")
        if target:
            link_targets.append(target)
    return link_targets

In [142]:
# Map each course name to the exam URLs found in its exam_table.html.
# Absolute links are used as they are; relative links are joined onto the
# "Exam Prepend" prefix stored in the course's metadata.json.
exam_web_paths = {}

for course_dir in course_dirs:
    print(f"Processing {course_dir.name}...")
    metadata_path = course_dir / "metadata.json"
    url_path = course_dir / "exam_table.html"
    if not url_path.exists():
        continue

    hrefs = [
        href
        for href in extract_a_tag_hrefs(url_path)
        if "instructor" not in href and "youtube" not in href and "tbp" not in href
    ]

    # The prefix is only needed for relative links, so metadata.json is read
    # at most once per course, and only when such a link is encountered.
    web_extension = None
    metadata_loaded = False

    urls = []
    for path in hrefs:
        # `"http" and "://" in path` only ever tested for "://", because a
        # non-empty string literal is always truthy. Check the scheme
        # explicitly instead.
        if path.lstrip().lower().startswith(("http://", "https://")):
            urls.append(path)
            continue

        if not metadata_loaded:
            metadata_loaded = True
            metadata = extract_data_from_json(metadata_path)
            prefix = metadata.get("Exam Prepend") if isinstance(metadata, dict) else None
            if isinstance(prefix, str) and prefix:
                # Drop one trailing slash so the join below does not double it.
                web_extension = prefix[:-1] if prefix.endswith("/") else prefix
            else:
                print(
                    f"No usable 'Exam Prepend' in {metadata_path}; "
                    "skipping relative links."
                )

        if web_extension is None:
            # Without a prefix a relative link cannot be turned into a URL.
            continue

        if not path.startswith("/"):
            path = "/" + path
        path = path.replace("../", "")
        urls.append(web_extension + path)

    # The prefix itself may contain "instructor", so filter once more after
    # the URLs have been assembled.
    urls = [url for url in urls if "instructor" not in url]
    exam_web_paths[course_dir.name] = urls

Processing data101...
Processing eecs105...
Processing cs170...
Processing data100...
Processing cs184...
Processing eecs16a...
Processing cs61b...
Processing cs61c...
Processing data8...
Processing cs164...
Processing eecs120...
Processing cs162...
Processing cs186...
Processing cs172...
Processing data102...
Processing cs188...
Processing eecs152...
Processing cs88...
Processing eecs16b...
Processing cs189...
Processing cs10...
Processing cs174...
Processing cs61a...
Processing eecs70...
Processing cs169...
Processing cs161...
Processing cs150...


In [143]:
import requests


def download_pdf(url, output_path, timeout=30):
    """Download ``url`` and save the response body to ``output_path``.

    Args:
        url: Address to fetch with an HTTP GET request.
        output_path: Destination file. It is opened in binary write mode, so
            an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host can block the whole batch
            indefinitely.

    Returns:
        True when the server answered with status 200 and the body was
        written; False for any other status code or when the request raised
        a ``requests.RequestException`` (timeouts included).

    Note:
        The body is written as received; it is not checked to be a PDF.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"An error occurred while downloading the PDF: {e}")
        return False

    if response.status_code != 200:
        print(f"Failed to download PDF. Status code: {response.status_code}")
        return False

    with open(output_path, "wb") as file:
        file.write(response.content)
    print(f"PDF successfully downloaded to {output_path}")
    return True

In [151]:
def write_urls_to_pdfs(urls, class_name):
    """Download every URL in ``urls`` into ``./<class_name>/exams``.

    Each file is saved as ``<position>_<name>``, where ``position`` is the
    index of the URL in ``urls``. Tying the prefix to the position keeps the
    filenames stable between runs, so files that are already present are
    recognised and skipped instead of being downloaded again under a
    different number.

    ``name`` is the last path segment of the URL with any query string or
    fragment removed. For Google Drive / Docs links it is the second-to-last
    segment with a ``.pdf`` suffix.

    Args:
        urls: Iterable of URL strings to download.
        class_name: Course directory name; files go to its ``exams`` folder,
            which is created when missing.

    Returns:
        A list of the URLs whose download was attempted and failed.
    """
    exams_path = Path(f"./{class_name}/exams")
    exams_path.mkdir(parents=True, exist_ok=True)

    failed_downloads = []
    for position, url in enumerate(urls):
        # Strip "#fragment" and "?query" so they neither end up in the
        # filename nor shift the path segments.
        bare_url = url.split("#", 1)[0].split("?", 1)[0]
        segments = bare_url.split("/")

        is_google_link = "drive.google.com" in url or "docs.google.com" in url
        if is_google_link and len(segments) >= 2:
            filename = segments[-2] + ".pdf"
        else:
            filename = segments[-1]

        target = exams_path / f"{position}_{filename}"
        if target.exists():
            continue

        if not download_pdf(url, target):
            failed_downloads.append(url)

    print(f"\n{class_name} processed successfully\n")
    return failed_downloads

In [152]:
# Download the exams course by course (alphabetical order) and remember, per
# course, which URLs could not be fetched. Courses for which no exam URLs
# were collected are passed over.
failed_links = {}
for course_name in sorted(course_names):
    if course_name in exam_web_paths:
        urls = exam_web_paths[course_name]
        failed_downloads = write_urls_to_pdfs(urls, course_name)
        failed_links[course_name] = failed_downloads

PDF successfully downloaded to cs10/exams/0_10jmfiZPNDI5uQI05owrF5zL9nlrQ6f7E.pdf
PDF successfully downloaded to cs10/exams/1_15JVkLNUjx5y3fEbBKaUQRnoLZraag52Y.pdf
PDF successfully downloaded to cs10/exams/2_1G19VfF5RIOXfHoZR1aqrJYJ5Msf6XXiy.pdf
PDF successfully downloaded to cs10/exams/3_1KeQzZ3XFwbtJ2dPWtPxe5nHtoQ3tqDpp.pdf
PDF successfully downloaded to cs10/exams/4_1ihEmzHacBgA9sMtoYj9fKyUJUUrPEqhu.pdf
PDF successfully downloaded to cs10/exams/5_1-iJmT8cYw3H96Gl-dnUoh9U1swqNnLub9n9bTsrDO9k.pdf
PDF successfully downloaded to cs10/exams/6_1IDzaiaCJX3mElz37F7FfP7EdtMwJW9Mm.pdf
PDF successfully downloaded to cs10/exams/7_1vIYrShUM5lpBX1MyRgiPG20T9hjIKIS5.pdf
PDF successfully downloaded to cs10/exams/8_1t69ewQCLvnLY17BU-z5eFsGlo8CPzJSH05JnYKLbqSI.pdf
PDF successfully downloaded to cs10/exams/9_1uhJJu_QGLzTyzpKYPHu16IkQYaEWdRxC.pdf
PDF successfully downloaded to cs10/exams/10_1VFS3_ycsXD3l6QVRSZJK6yOjKOHlB8BE.pdf
PDF successfully downloaded to cs10/exams/11_xlNPgBWuNx2UDx0iWePuXKVa
PDF s

KeyError: 'cs170'

In [None]:
# Show the URLs that could not be downloaded, keyed by course name.
failed_links

{'cs10': ['https://drive.google.com/file/d/13yRMg8UbTQpS-X7u78j7MhXx9OUhrt82/view',
  'https://drive.google.com/file/d/1s9n_CoJB5RpaNuUmxPBRxVfXgPOQLWIz/view',
  'https://drive.google.com/file/d/1CO4AartRWdyw-vIgIS1lzzcqgDSbFB9S/view',
  'https://drive.google.com/file/d/1G_blwdl9uMn2jTvMjbs7ZVy2Hnc5JJiS/view?usp=sharing',
  'https://drive.google.com/file/d/1RNL6ChgNuiQ00ngHOBjoVq4I8UEOlzzL/view?usp=sharing',
  'https://drive.google.com/file/d/1OyR9Zk3q11Xi6xqzQSmq8QdVb3OVHjIW/view?usp=sharing',
  'https://drive.google.com/file/d/1lSs3TXCJrqB22zgLBByZTI1-5ITqfE9K/view',
  'https://drive.google.com/file/d/1IXByRzNhPgjmiDDb9kufU4x02Gx2xI9C/view?usp=sharing',
  'https://drive.google.com/file/d/192pX5wig1Y-2p9OVktznTe0Obf5faode/view?usp=sharing',
  'https://drive.google.com/file/d/1ZuWs6FWDZAsVCJSDLiPXIfLePhm-eBfT/view?usp=sharing']}

In [None]:
# Save the failed links next to the notebook so they can be retried or
# inspected later; non-ASCII characters are written as-is.
serialized_links = json.dumps(failed_links, ensure_ascii=False, indent=4)
with open("./failed_download_links.json", "w", encoding="utf-8") as file:
    file.write(serialized_links)