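This program crawls a website and explores all links of a specific type: internal pages under a given base URL, skipping localized pages and links that point to files.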

All imported libraries

In [147]:
import requests
from bs4 import BeautifulSoup
import json
import urllib.parse
from collections import deque
from datetime import datetime
import os


In [148]:
def load_visited(file_path):
    """Read the visited URLs stored in ``file_path`` and return them as a set.

    Every line of the file is treated as one URL. Surrounding whitespace is
    stripped and blank lines are ignored. If the file does not exist, an
    empty set is returned.
    """
    if not os.path.exists(file_path):
        return set()
    with open(file_path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        return {entry for entry in stripped_lines if entry}



In [149]:
def ensure_directories():
    """Create the link input and output directories unless they already exist."""
    for directory in ("../input/links", "../output/links"):
        os.makedirs(directory, exist_ok=True)


In [150]:
def save_visited(file_path, visited):
    """Save visited URLs to a file, one URL per line, in sorted order.

    Args:
        file_path: Path of the file to write; an existing file is overwritten.
        visited: Iterable of URL strings (a set in this notebook).
    """
    # The docstring must be the first statement of the function; placed after
    # the print call it was only a discarded string expression.
    print("Save the visited URLs")
    with open(file_path, "w", encoding="utf-8") as f:
        f.writelines(f"{url}\n" for url in sorted(visited))


In [151]:
def load_pending(file_path):
    """Read the pending URLs stored in ``file_path`` and return them as a deque.

    The order of the lines in the file is kept. Surrounding whitespace is
    stripped and blank lines are dropped. If the file does not exist, an
    empty deque is returned.
    """
    queue = deque()
    if not os.path.exists(file_path):
        return queue
    with open(file_path, "r", encoding="utf-8") as handle:
        queue.extend(entry for entry in map(str.strip, handle) if entry)
    return queue



In [152]:
def save_pending(file_path, pending):
    """Save pending URLs to a file, one URL per line, in queue order.

    Args:
        file_path: Path of the file to write; an existing file is overwritten.
        pending: Iterable of URL strings (a deque in this notebook). The
            iteration order is written as-is so a later run resumes in the
            same order.
    """
    # The docstring must be the first statement of the function; placed after
    # the print call it was only a discarded string expression.
    print("Save the pending URLs")
    with open(file_path, "w", encoding="utf-8") as f:
        f.writelines(f"{url}\n" for url in pending)



In [153]:
import re

# Localized URLs: the first path segment after the host is exactly two
# lowercase letters followed by a slash (for example, a URL that begins with
# https://www.uni-bamberg.de/en/ matches).
pattern = r"^https://www\.uni-bamberg\.de/([a-z]{2})/.*$"


def is_localized_url(url):
    """Return True when ``url`` matches the localized-URL ``pattern``, else False."""
    return re.match(pattern, url) is not None

# Generic file pattern: any www.uni-bamberg.de URL that ends with a dot
# followed by 2 to 5 alphanumeric characters (an extension-like suffix).
file_pattern = r"^https://www\.uni-bamberg\.de/.*\.([a-zA-Z0-9]{2,5})$"

# Explicit file pattern: the same host, but only the listed extensions are
# accepted. The extensions are matched case-sensitively (lowercase only).
specific_file_pattern = r"^https://www\.uni-bamberg\.de/.*\.(pdf|doc|docx|xls|xlsx|ppt|pptx|jpg|jpeg|png|gif|webp|txt|csv|zip|rar)$"


def is_file_url(url, pattern=specific_file_pattern):
    """Return True when ``url`` matches ``pattern``, else False.

    ``pattern`` defaults to ``specific_file_pattern``; pass ``file_pattern``
    to accept any extension-like suffix instead of the explicit list.
    """
    matched = re.match(pattern, url)
    return matched is not None

In [154]:
def _iter_internal_links(soup, page_url, base_url):
    """Yield the absolute URLs of the anchors in ``soup`` that the crawler may follow."""
    for link in soup.find_all("a"):
        href = link.get("href")
        # Skip anchors without a target, links that carry a fragment, and XML resources.
        if href is None or "#" in href or href.endswith(".xml"):
            continue
        full_url = urllib.parse.urljoin(page_url, href)
        if is_localized_url(full_url):
            continue
        # Both the explicit extension list and the generic extension pattern are rejected.
        if is_file_url(full_url) or is_file_url(full_url, pattern=file_pattern):
            continue
        if full_url.startswith(base_url):
            yield full_url


def scrape_website(base_url, max_pages=0, visited_file="input/links/visited_urls.txt", pending_file="input/links/pending_urls.txt", max_total_size=100 * 1024 ** 3, checkpoint_every=2000):
    """
    Scrape the website starting at base_url, following internal links breadth-first.

    The crawl state is loaded from visited_file and pending_file at the start,
    so a previous run can be resumed, and it is written back periodically and
    when the crawl ends.

    Args:
        base_url: Start URL; only links that start with this prefix are queued.
        max_pages: Stop after this many pages were scraped successfully.
            0 means run until no pending links remain.
        visited_file: File holding the visited URLs, one per line.
        pending_file: File holding the pending URLs, one per line.
        max_total_size: Stop once the total size of the downloaded response
            bodies reaches this many bytes (default 100 * 1024**3).
            None disables the limit.
        checkpoint_every: Save the crawl state whenever the number of visited
            URLs is a multiple of this value. 0 disables periodic saving.

    Returns:
        The number of pages scraped successfully (HTTP status 200) in this run.
    """
    # Create the directories of both state files before crawling starts, so a
    # missing directory cannot make the save at the end of a long run fail.
    for state_file in (visited_file, pending_file):
        parent_dir = os.path.dirname(state_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Load previously visited URLs and pending URLs.
    visited = load_visited(visited_file)
    to_visit = load_pending(pending_file)
    # Mirror of the queue contents: set membership is O(1), whereas testing
    # membership on the deque scans every queued URL for every link found.
    queued = set(to_visit)

    # If base_url is not visited and not in the pending list, add it.
    if base_url not in visited and base_url not in queued:
        to_visit.append(base_url)
        queued.add(base_url)

    count = 0  # Counter for the number of scraped pages.
    total_scraped_size = 0  # Total size of scraped data in bytes.

    # Continue scraping while there are URLs to visit, the max_pages condition holds,
    # and the total scraped data size is below max_total_size.
    while (
        to_visit
        and (max_pages == 0 or count < max_pages)
        and (max_total_size is None or total_scraped_size < max_total_size)
    ):
        url = to_visit.popleft()
        # Every popped URL is either already visited or marked visited below,
        # so it can be dropped from the queue mirror right away.
        queued.discard(url)
        if url in visited:
            continue

        print(f"Scraping: {count} {url}")
        visited.add(url)
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                # Account for the downloaded body so the size limit is actually enforced.
                total_scraped_size += len(response.content)
                soup = BeautifulSoup(response.text, "html.parser")
                count += 1

                # Queue the internal links.
                for full_url in _iter_internal_links(soup, url, base_url):
                    if full_url not in visited and full_url not in queued:
                        to_visit.append(full_url)
                        queued.add(full_url)
            else:
                print(f"Skipping {url} due to response status: {response.status_code}")
        except Exception as e:
            # Best effort: one failing page must not abort the whole crawl.
            print(f"Error scraping {url}: {e}")

        # Checkpoint once per visited URL rather than once per link on the page;
        # the visited set grows by exactly one per iteration that reaches this point.
        if checkpoint_every and len(visited) % checkpoint_every == 0:
            save_visited(visited_file, visited)
            save_pending(pending_file, to_visit)

    # Save the updated visited URLs and pending URLs.
    save_visited(visited_file, visited)
    save_pending(pending_file, to_visit)
    return count


In [155]:
ensure_directories()

base_url = "https://www.uni-bamberg.de/its/"
# Set max_pages to 0 to scrape the whole website, or any positive integer to limit the pages.
max_pages = 0

# Keep both crawl-state files in the directory that ensure_directories() creates
# ("../input/links"). Previously the visited file pointed at "input/links", which
# split the resume state across two directories, one of which may not exist.
visited_file = "../input/links/visited_urls.txt"
pending_file = "../input/links/pending_urls.txt"

total_scraped = scrape_website(base_url, max_pages, visited_file, pending_file)
print(f"Scraping completed. Total pages scraped: {total_scraped}.")

Scraping: 0 https://www.uni-bamberg.de/its/
Scraping: 1 https://www.uni-bamberg.de/its/verfahrensweisen/
Scraping: 2 https://www.uni-bamberg.de/its/verfahrensweisen/richtlinien/
Scraping: 3 https://www.uni-bamberg.de/its/verfahrensweisen/datenschutz/
Scraping: 4 https://www.uni-bamberg.de/its/verfahrensweisen/datenschutz/datenschutzerklaerungen/
Scraping: 5 https://www.uni-bamberg.de/its/verfahrensweisen/datenschutz/datenschutzerklaerungen/allgemeine-datenschutzerklaerung/
Scraping: 6 https://www.uni-bamberg.de/its/verfahrensweisen/datenschutz/datenschutzerklaerungen/eduroam/
Scraping: 7 https://www.uni-bamberg.de/its/verfahrensweisen/datenschutz/datenschutzerklaerungen/hawki/
Scraping: 8 https://www.uni-bamberg.de/its/verfahrensweisen/datenschutz/datenschutzerklaerungen/microsoft-365/
Scraping: 9 https://www.uni-bamberg.de/its/verfahrensweisen/datenschutz/datenschutzerklaerungen/telekommunikationsanlage/
Scraping: 10 https://www.uni-bamberg.de/its/verfahrensweisen/datenschutz/datensch