This program tries to explore all the links of a specific type (links whose URL starts with a given base URL).

All imported libraries

In [38]:
import requests
from bs4 import BeautifulSoup
import json
import urllib.parse
from collections import deque
from datetime import datetime
import os

# Size units, expressed in bytes.
ONE_GB = 1024 ** 3  # binary gigabyte (1 GiB = 1,073,741,824 bytes)
HUNDRED_MB = 100 * 1_000_000  # decimal megabytes (100 MB = 100,000,000 bytes)
# Upper bound on the total amount of scraped data: 20 GB.
MAX_TOTAL_SIZE = ONE_GB * 20

# Site section whose links are to be explored.
link_to_explore = "https://www.uni-bamberg.de/en/its/"

In [39]:
def load_visited(file_path):
    """Return the set of URLs recorded in *file_path*.

    The file is read as UTF-8 with one URL per line. Surrounding whitespace
    is stripped from every line and blank lines are ignored. A missing file
    yields an empty set.
    """
    if not os.path.exists(file_path):
        return set()
    with open(file_path, "r", encoding="utf-8") as handle:
        stripped_lines = {raw_line.strip() for raw_line in handle}
    # Blank or whitespace-only lines collapse to "", which is not a URL.
    stripped_lines.discard("")
    return stripped_lines



In [40]:
def ensure_directories():
    """Create the input and output working directories if they are missing.

    Paths are relative to the current working directory; directories that
    already exist are left untouched.
    """
    for directory in ("input/explore", "output/explore"):
        os.makedirs(directory, exist_ok=True)


In [41]:
def save_visited(file_path, visited):
    """Write the visited URLs to *file_path* in sorted order, one per line.

    Any existing file is overwritten; the output is UTF-8 encoded.
    """
    with open(file_path, "w", encoding="utf-8") as handle:
        handle.writelines(url + "\n" for url in sorted(visited))


In [42]:
def load_pending(file_path):
    """Return the queue of pending URLs stored in *file_path*.

    The file is read as UTF-8 with one URL per line. Whitespace around each
    line is stripped and blank lines are skipped. File order (including any
    duplicates) is preserved. A missing file yields an empty deque.
    """
    if not os.path.exists(file_path):
        return deque()
    with open(file_path, "r", encoding="utf-8") as handle:
        candidates = [raw_line.strip() for raw_line in handle]
    return deque(entry for entry in candidates if entry)



In [43]:
def save_pending(file_path, pending):
    """Write the pending URLs to *file_path*, one per line, in queue order.

    Any existing file is overwritten; the output is UTF-8 encoded.
    """
    with open(file_path, "w", encoding="utf-8") as handle:
        handle.writelines(url + "\n" for url in pending)



In [45]:
def flush_data(data, batch_index):
    """Dump *data* to a timestamped JSON file under ``output/``.

    The file name combines the current local time (second resolution) with
    *batch_index*. The JSON is written as UTF-8, indented by two spaces, with
    non-ASCII characters kept as-is. The ``output`` directory must already
    exist.

    Returns the relative path of the file that was written.
    """
    stamp = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
    target = f"output/scraped_data_{stamp}_batch{batch_index}.json"
    with open(target, "w", encoding="utf-8") as sink:
        json.dump(data, sink, ensure_ascii=False, indent=2)
    print(f"Flushed {len(data)} records to {target}")
    return target


In [56]:
def _extract_internal_links(soup, page_url, base_url):
    """Yield absolute, fragment-free URLs of the page's links that start with base_url."""
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if href is None or href.startswith("#"):
            continue
        try:
            absolute = urllib.parse.urljoin(page_url, href)
            # Drop the fragment so "page" and "page#section" are queued once.
            absolute = urllib.parse.urldefrag(absolute).url
        except ValueError:
            # A malformed href (e.g. a broken IPv6 literal) must only cost us
            # this one link, not the remaining links on the page.
            continue
        if absolute.startswith(base_url):
            yield absolute


def scrape_website(base_url, max_pages=0, visited_file="input/explore/visited_urls.txt", pending_file="input/explore/pending_urls.txt"):
    """
    Explore the links of a website, breadth-first, starting at base_url.

    Only links whose absolute URL starts with base_url are followed. Page
    content is not stored; the function only discovers and visits links.

    Crawl state is resumable: visited URLs are loaded from / saved to
    visited_file and the queue of pending URLs from / to pending_file. The
    state is saved even when the crawl is interrupted by an exception
    (including KeyboardInterrupt), which is then re-raised.

    The crawl stops when any of the following holds:
      * no pending links remain;
      * max_pages is non-zero and max_pages pages have been scraped
        (max_pages == 0 means "no page limit");
      * the total number of downloaded bytes reaches MAX_TOTAL_SIZE.

    A URL is marked visited before it is requested, so URLs that fail or
    return a non-200 status are not retried on later runs.

    Returns the number of pages successfully scraped (HTTP 200 and parsed)
    during this call.
    """
    # Load previously visited URLs and pending URLs.
    visited = load_visited(visited_file)
    to_visit = load_pending(pending_file)

    # If base_url is not visited and not in the pending list, add it.
    if base_url not in visited and base_url not in to_visit:
        to_visit.append(base_url)

    # Mirror of the queue's contents for O(1) membership tests; scanning the
    # deque for every discovered link would be quadratic on large sites.
    queued = set(to_visit)

    count = 0  # Number of pages scraped in this call.
    total_scraped_size = 0  # Total size of downloaded page bodies in bytes.

    try:
        while (
            to_visit
            and (max_pages == 0 or count < max_pages)
            and total_scraped_size < MAX_TOTAL_SIZE
        ):
            url = to_visit.popleft()
            queued.discard(url)
            if url in visited:
                continue

            print(f"Scraping: {count} {url}")
            visited.add(url)
            try:
                response = requests.get(url, timeout=10)
                if response.status_code != 200:
                    print(f"Skipping {url} due to response status: {response.status_code}")
                    continue

                # Account for the downloaded body so the size cap can trigger.
                total_scraped_size += len(response.content)

                soup = BeautifulSoup(response.text, "html.parser")

                count += 1

                # Queue the internal links that are neither visited nor queued.
                for full_url in _extract_internal_links(soup, url, base_url):
                    if full_url not in visited and full_url not in queued:
                        to_visit.append(full_url)
                        queued.add(full_url)

            except Exception as e:
                # Per-URL boundary: one bad page must not abort the crawl.
                print(f"Error scraping {url}: {e}")
                continue
    finally:
        # Persist progress on every exit path so an interrupted crawl can
        # be resumed instead of starting over.
        save_visited(visited_file, visited)
        save_pending(pending_file, to_visit)

    return count


In [58]:
# Crawl configuration. A max_pages of 0 scrapes the whole website; any
# positive integer limits the number of pages.
base_url = "https://www.uni-bamberg.de/en/its/"
max_pages = 0

# Files in which the crawl state (visited and pending URLs) is kept.
visited_file = "input/explore/visited_urls.txt"
pending_file = "input/explore/pending_urls.txt"

# The state files live under input/explore, so create the directories first.
ensure_directories()

total_scraped = scrape_website(
    base_url,
    max_pages=max_pages,
    visited_file=visited_file,
    pending_file=pending_file,
)
print(f"Scraping completed. Total pages scraped: {total_scraped}.")

Scraping: 0 https://www.uni-bamberg.de/en/its/
Scraping: 1 https://www.uni-bamberg.de/en/its/it-services/
Scraping: 2 https://www.uni-bamberg.de/en/its/it-services/login-roles-rights-iam/
Scraping: 3 https://www.uni-bamberg.de/en/its/dienstleistungen/support/
Scraping: 4 https://www.uni-bamberg.de/en/its/dienstleistungen/support/first-year-first-aid/
Scraping: 5 https://www.uni-bamberg.de/en/its/dienstleistungen/support/setting-up-your-work-place-for-home-office/
Scraping: 6 https://www.uni-bamberg.de/en/its/dienstleistungen/pc-pools/
Scraping: 7 https://www.uni-bamberg.de/en/its/it-services/data-network-wireless-internet-vpn/
Scraping: 8 https://www.uni-bamberg.de/en/its/dienstleistungen/eva/
Scraping: 9 https://www.uni-bamberg.de/en/its/dienstleistungen/pc/
Scraping: 10 https://www.uni-bamberg.de/en/its/dienstleistungen/kurse/
Scraping: 11 https://www.uni-bamberg.de/en/its/it-services/webdienste/
Scraping: 12 https://www.uni-bamberg.de/en/its/wir/
Scraping: 13 https://www.uni-bamberg