# The following code downloads the sitemap and saves it to a JSON file, then scrapes the website page by page

## Model classes for the scraper

In [16]:
class SiteData:
    """Placeholder model for site-level data; it currently holds no state."""

    def __init__(self) -> None:
        # Intentionally empty: no attributes are defined yet.
        return None


class PageData:
    """Container for the data scraped from a single page.

    Attributes:
        url: Address of the page; also serves as the object's string form.
        content: Page content as supplied by the caller.
        text: Text of the page as supplied by the caller.
        images: List of image descriptions as supplied by the caller.
    """

    def __init__(self, url: str, content: str, text: str, images: list):
        self.url, self.content = url, content
        self.text, self.images = text, images

    def __repr__(self):
        # The URL doubles as the printable representation of the page.
        return self.url

    __str__ = __repr__

## Save the sitemap XML to a JSON file

In [17]:
import json
import requests
from bs4 import BeautifulSoup, SoupStrainer
from datetime import datetime

# Parse filter selecting the element with id "content-main"; load_site passes it
# as `parse_only` so that only this part of each downloaded page is parsed.
content_strainer = SoupStrainer(id="content-main")
def _read_sitemap_entries(soup, tag_name: str) -> list:
    """Collect ``link``/``lastmod`` records from every ``tag_name`` element of a parsed sitemap."""
    entries = []
    for node in soup.find_all(tag_name):
        loc = node.loc
        if loc is None:
            # An entry without <loc> carries no URL, so there is nothing to fetch.
            continue
        lastmod = node.lastmod
        entries.append(
            {
                "link": loc.text.strip(),
                # Keep None when <lastmod> is absent instead of failing on it.
                "lastmod": (
                    datetime.fromisoformat(lastmod.text.strip())
                    if lastmod is not None
                    else None
                ),
            }
        )
    return entries


def save_site_map_json(timeout: float = 120.0):
    """Download the university sitemap index and all sitemaps it lists, then
    store the collected page links in ``sitemap.json``.

    Each stored record has a ``link`` and a ``lastmod`` entry; ``lastmod`` values
    are serialized with ``str`` (``None`` becomes JSON ``null``).

    Args:
        timeout: Seconds to wait for each HTTP response. The default is generous
            because the individual sitemaps can take a while to be served.

    Raises:
        requests.exceptions.RequestException: If a request fails, times out or
            returns an HTTP error status. Nothing is written in that case, so a
            partial sitemap is never saved.
    """
    print('Save site map started...')
    sitemap_index = requests.get(
        "https://www.uni-bamberg.de/sitemap.xml", timeout=timeout
    )
    sitemap_index.raise_for_status()
    sitemap_soup = BeautifulSoup(
        markup=sitemap_index.content,
        parse_only=SoupStrainer("sitemap"),
        features="xml",
    )
    # Read the individual sitemaps and their last modification date from the
    # global sitemap index.
    sitemap_links = _read_sitemap_entries(sitemap_soup, "sitemap")

    sitemap_link_strainer = SoupStrainer("url")
    site_links = []
    # Walk through the individual sitemaps.
    for sitemap_link in sitemap_links:
        response = requests.get(sitemap_link["link"], timeout=timeout)
        response.raise_for_status()
        sitemap_content = BeautifulSoup(
            markup=response.content,
            parse_only=sitemap_link_strainer,
            features="xml",
        )
        # Read the pages of the current sitemap together with the time of their
        # last modification.
        site_links.extend(_read_sitemap_entries(sitemap_content, "url"))

    # Store the collected links as JSON so the sitemaps do not have to be fetched
    # again on every run (this can take a while, as some of them are apparently
    # only generated on access).
    with open("sitemap.json", "w") as f:
        json.dump(site_links, fp=f, default=str)
    print('Site map saved.')


## Load site map from the json file

In [18]:
def load_site_map_json() -> list:
    """Load the saved sitemap entries from ``sitemap.json``.

    Returns:
        The decoded JSON content of the file, or an empty list when the file
        does not exist yet, so that callers can check for emptiness to decide
        whether the sitemap still has to be downloaded.
    """
    print("Load site map started...")
    try:
        with open(file="sitemap.json", mode="r") as f:
            sitemaps = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been saved yet.
        print("Site map file not found.")
        return []
    print("Site map loaded.")
    return sitemaps


## Load a site from a given url

In [19]:
def load_site(url: dict, timeout: float = 30.0, retries: int = 3) -> PageData:
    """Download one page and extract its main content, text and images.

    Args:
        url: Sitemap entry, a mapping whose ``"link"`` key holds the page URL
            (the parameter name is kept for backward compatibility).
        timeout: Seconds to wait for the server on each request attempt.
        retries: Total number of request attempts made when the connection
            fails or times out; values below 1 are treated as 1.

    Returns:
        A PageData holding the URL, the prettified markup of the parsed
        content, its text joined with single spaces, and one
        ``{"src", "title", "alt"}`` dict per ``<img>`` found.

    Raises:
        KeyError: If ``url`` has no ``"link"`` key.
        requests.exceptions.RequestException: If the page cannot be fetched
            after all attempts.
    """
    import time  # local import: the notebook imports `time` only in a later cell

    link = url["link"]
    print(f"Load site started for: {link}")

    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(link, timeout=timeout)
            break
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            if attempt == attempts:
                raise
            # Back off before retrying so a struggling server is not hit again
            # immediately.
            time.sleep(2 ** attempt)

    # NOTE(review): the HTTP status is not checked, so error pages are parsed
    # like regular pages (unchanged from the previous behavior).
    soup = BeautifulSoup(
        markup=response.content,
        parse_only=content_strainer,
        features="lxml",
    )
    try:
        images = [
            {
                "src": img.get("src"),
                "title": img.get("title"),
                "alt": img.get("alt"),
            }
            for img in soup.find_all("img")
        ]
    except Exception as exc:
        # Image extraction is best effort: report the problem and continue
        # without images rather than losing the page.
        print(f"Could not extract images for {link}: {exc}")
        images = []
    data = PageData(link, soup.prettify(), soup.get_text(" ", strip=True), images)
    print("Site loaded.")
    return data


## Main code to run the scraper

In [20]:
import time
import pymongo
from multiprocessing import Pool

# Wall-clock start, used to report the total run time at the end.
st = time.time()

# Number of sitemap entries handed to a worker process per task.
CHUNK_SIZE = 100

# The pool is started before the MongoDB client is created, so the worker
# processes do not inherit a copy of the client. The `with` block also makes
# sure the workers are terminated when a page fails and the loop is aborted.
with Pool() as pool:
    # Use the saved sitemap when available; download it first otherwise.
    try:
        site_map = load_site_map_json()
    except FileNotFoundError:
        site_map = []
    if len(site_map) == 0:
        save_site_map_json()
        site_map = load_site_map_json()

    # Initiate mongo db
    mongo_client = pymongo.MongoClient()
    mongo_database = mongo_client["rag"]
    mongo_pages = mongo_database["pages"]

    # One timestamp for the whole run, stored with every page that is written.
    timestamp = datetime.now().replace(microsecond=0).isoformat()

    # Pages are scraped in the worker processes; results arrive in sitemap
    # order and are upserted one by one, keyed by their URL.
    for page in pool.imap(load_site, site_map, CHUNK_SIZE):
        print(page.url)
        mongo_pages.update_one(
            filter={"_id": page.url},
            update={
                "$set": {
                    "_id": page.url,
                    "last_update": timestamp,
                    "content": page.content,
                    "text": page.text,
                    "images": page.images,
                }
            },
            upsert=True,
        )
end = time.time()
print((end - st))

Load site started for: https://www.uni-bamberg.de/abt-studium/aufgaben/pruefungs-studienordnungen/bachelorstudiengaenge/geschichte/Load site started for: https://www.uni-bamberg.de/slavling/kontaktnavigation/kontakt/Load site started for: https://www.uni-bamberg.de/impressum/Load site started for: https://www.uni-bamberg.de/universitaet/zahlen-und-fakten/jahresberichte/Load site started for: https://www.uni-bamberg.de/ev-syst/Load site started for: https://www.uni-bamberg.de/teilbibliothek5/Load site started for: https://www.uni-bamberg.de/klinpsych/kontaktnavigation/kontakt/Load site started for: https://www.uni-bamberg.de/anglistik-amerikanistik/studium/anerkennung-von-im-ausland-erbrachten-leistungen/Load site started for: https://www.uni-bamberg.de/entwicklungspsychologie/studium/themen-fuer-abschlussarbeiten-bachelormasterdiplom/Load site started for: https://www.uni-bamberg.de/romling/doktoranden/Load site started for: https://www.uni-bamberg.de/auslandsstudium/ich-moechte-ins-au

ConnectionError: None: Max retries exceeded with url: /abt-studium/aufgaben/pruefungs-studienordnungen/diplom-studiengaenge/soziologie/ (Caused by None)

Load site started for: https://www.uni-bamberg.de/pruefungsamt/flexnow/fn2web/termine/Load site started for: https://www.uni-bamberg.de/sz/wir-ueber-uns/lektoreninnen-von-a-z/formiani-luca/Load site started for: https://www.uni-bamberg.de/drittmittelhaushalt/mitarbeiter/


Load site started for: https://www.uni-bamberg.de/amerikanistik/gastprofessorinnen/smith-2014/Load site started for: https://www.uni-bamberg.de/euroethno/studium/praktikum/museumsdienste/
Load site started for: https://www.uni-bamberg.de/euroethno/lehrstuhl/medien/

Load site started for: https://www.uni-bamberg.de/sz/wir-ueber-uns/akademischer-beirat-des-sprachenzentrums/Load site started for: https://www.uni-bamberg.de/trac/emeriti-of-excellence/doerner/Load site started for: https://www.uni-bamberg.de/pul/forschung/vortraege/


Load site started for: https://www.uni-bamberg.de/bwl-bsl/finance-accounting-tag/1-finance-accounting-tag-2018/Load site started for: https://www.uni-bamberg.de/zemas/forschung/Load site st