In [1]:
import time
from urllib.parse import urljoin

import bs4
import pandas as pd
import requests

In [2]:
def get_pagination_links(url, timeout=30):
    """Collect the page URLs linked from the pagination bar of a 'people' page

    Args:
      - url: The link to the 'people' page
      - timeout: Seconds to wait for the server before giving up

    Returns:
      A list of absolute page URLs without duplicates, in the order they
      first appear in the pagination bar. If the page has no pagination bar
      the list contains only `url` itself.

    Raises:
      Whatever `response.raise_for_status()` raises when the server answers
      with an HTTP error status.
    """

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    soup = bs4.BeautifulSoup(response.content, "html.parser")
    pagination_div = soup.find("div", class_="g-pagination")

    if pagination_div is None:
        # NOTE(review): assumes a missing pagination bar means the whole
        # listing fits on this one page -- confirm against the site.
        return [url]

    links = []
    # href=True skips anchors that carry no href attribute at all.
    for tag in pagination_div.find_all("a", href=True):
        # Resolve first, then de-duplicate: the list holds absolute URLs, so
        # comparing the raw (relative) href against it would never match.
        absolute_link = urljoin(url, tag["href"])
        if absolute_link not in links:
            links.append(absolute_link)

    return links

In [3]:
def get_people_info(page_url, timeout=30):
    """Opens one 'people' page and scrapes the data of every person on it

    Args:
      - page_url: The link to the 'people' page
      - timeout: Seconds to wait for the server before giving up

    Returns:
      A list of dictionaries, one per person box found on the page (as
      produced by `extract_person_info`). The list is empty when the page
      has no person container.

    Raises:
      Whatever `response.raise_for_status()` raises when the server answers
      with an HTTP error status.
    """

    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    soup = bs4.BeautifulSoup(response.content, "html.parser")
    people_div = soup.find("div", class_="g-person-container")

    if people_div is None:
        # No container on the page: report "nobody found" instead of failing
        # with an AttributeError on None.
        return []

    people_box_tags = people_div.find_all(class_="e-person-content")

    return [extract_person_info(box_tag) for box_tag in people_box_tags]


def extract_person_info(tag):
    """Extracts useful info from a person's box on the page

    Args:
      tag: a BeautifulSoup Tag pertaining to the div for the person's box

    Returns:
      A dictionary with the keys "name", "title", "role" and "link".
      Text values have surrounding whitespace removed. A value is None when
      the corresponding element is absent from the box, so one incomplete
      entry does not abort the whole scrape.
    """

    def _clean_text(element):
        # Missing elements come back from `find` as None; map them to None
        # instead of failing on `.text`.
        return element.text.strip() if element is not None else None

    title_box = tag.find("div", class_="e-person-title")
    anchor = title_box.find("a") if title_box is not None else None

    data = {
        "name": _clean_text(tag.find("h3")),
        "title": _clean_text(tag.find("span", class_="e-person-academic")),
        "role": _clean_text(tag.find("div", class_="e-person-role")),
        # `.get` tolerates an anchor without an href attribute.
        "link": anchor.get("href") if anchor is not None else None,
    }

    return data

In [4]:
def scrape_person_data(url, wait_time=1):
    """Scrape every person (across all pages) of a Uni St. Gallen 'people' page

    Args:
      - url: The link to the 'people' page
      - wait_time: Seconds to pause after each page download

    Returns:
      A pandas DataFrame with one row per scraped person
    """

    def _download_then_pause(page_link):
        # Fetch one page, then wait so requests are spaced out.
        rows = get_people_info(page_link)
        time.sleep(wait_time)
        return rows

    page_links = get_pagination_links(url)
    time.sleep(wait_time)

    # Pages are processed strictly in order; each page's rows are appended
    # after that page's pause has elapsed.
    collected = [
        row
        for page_link in page_links
        for row in _download_then_pause(page_link)
    ]

    return pd.DataFrame(collected)

In [5]:
# Scrape the listing at the 'professoren' URL (performs live HTTP requests).
profs = scrape_person_data(
    "https://www.unisg.ch/en/universitaet/schools/economics-and-political-science"
    "/ueber-seps/dozierende/professoren"
)

In [6]:
# Scrape the listing at the 'assistenzprofessoren' URL (performs live HTTP requests).
assistant_profs = scrape_person_data(
    "https://www.unisg.ch/en/universitaet/schools/economics-and-political-science"
    "/ueber-seps/dozierende/assistenzprofessoren"
)

In [7]:
# Stack both scraped tables into one; ignore_index=True gives the combined
# frame a fresh 0..n-1 row index instead of repeating each table's own.
everyone = pd.concat((profs, assistant_profs), ignore_index=True)
everyone.head()

Unnamed: 0,name,title,role,link
0,Francesco Audrino,Prof. Ph.D.,Professor of Statistics,https://www.unisg.ch/en/personenverzeichnis/44...
1,Johannes Binswanger,Prof. Dr.,Professor of Business Economics and Public Policy,https://www.unisg.ch/en/personenverzeichnis/22...
2,Timo Boppart,Prof. PhD,Professor for International Economics,https://www.unisg.ch/en/personenverzeichnis/fb...
3,Stefan Bühler,Prof. Dr.,Professor of Applied Microeconomics,https://www.unisg.ch/en/personenverzeichnis/62...
4,Guido Cozzi,Prof. PhD,Professor of Macroeconomics,https://www.unisg.ch/en/personenverzeichnis/79...


In [8]:
# Persist the combined table in the current working directory. No
# `index=False` is passed, so pandas' default of also writing the row index
# (as an unnamed first column) applies.
everyone.to_csv("st_gallen_faculty.csv")