In [1]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
import bs4
import time
import pandas as pd

In [2]:
def open_page(url, wait_time=5):
    """Open Chrome on a QS ranking page and dismiss the overlays that block it.

    A new Chrome window is started, resized to 1600x900 and pointed at
    ``url``. After waiting ``wait_time`` seconds the function tries to click
    the cookie-consent button and, if present, the link inside the
    "no ranking results found" notice.

    Args:
      - url: The URL of the QS ranking page
      - wait_time: time to wait after loading the page (seconds)

    Returns:
      The Selenium webdriver with the page open. The caller owns it and is
      responsible for shutting it down.

    Raises:
      Any exception raised while starting the browser or loading the page.
      If the failure happens after the browser was started, the browser is
      quit before the exception propagates.
    """

    # Exceptions that simply mean "this optional overlay is not there".
    overlay_absent = (NoSuchElementException, ElementNotInteractableException)

    browser = webdriver.Chrome()
    try:
        browser.set_window_size(1600, 900)
        browser.get(url)
        time.sleep(wait_time)

        # find_element(by, value) with literal locator strategies is used
        # instead of the find_element_by_* helpers, which were removed in
        # Selenium 4.3; the generic form works on both old and new versions.
        try:
            browser.find_element("css selector", "button.agree-button").click()
        except overlay_absent:
            # The consent dialog is not always shown; the page is still
            # usable without dismissing it.
            pass

        try:
            browser.find_element("class name", "no-ranking-results-found") \
                   .find_element("tag name", "a") \
                   .click()
        except overlay_absent:
            pass
    except Exception:
        # Do not leave an orphaned browser/driver process behind on failure.
        browser.quit()
        raise

    return browser

In [3]:
def parse_page(browser):
    """Parses the ranking rows of the currently displayed paginated page.

    Args:
      - browser: A Selenium webdriver with the correct page already open

    Returns:
      A list of dictionaries, one per element with class
      ``_qs-ranking-data-row``, with the keys ``name``, ``rank``,
      ``location`` and ``score``. Each value is the raw text of the matching
      element, or ``None`` when the row has no such element.
    """

    row_class = "_qs-ranking-data-row"

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning about the guess.
    soup = bs4.BeautifulSoup(browser.page_source, "html.parser")

    def field_text(row, css_class):
        """Return the text of the row's ``css_class`` element, or None."""
        match = row.find_next(class_=css_class)
        # find_next() keeps searching through the rest of the document, so a
        # row lacking this field would otherwise pick up the value of a later
        # row. Accept the match only if the closest preceding row element is
        # the row being parsed.
        if match is None or match.find_previous(class_=row_class) is not row:
            return None
        return match.text

    return [
        {
            "name": field_text(row, "uni-link"),
            "rank": field_text(row, "_univ-rank"),
            "location": field_text(row, "location"),
            "score": field_text(row, "overall-score-span"),
        }
        for row in soup.find_all(class_=row_class)
    ]

In [4]:
def next_page(browser, wait_time=5):
    """Navigates to the next paginated page.

    Clicks the element matching ``a.page-link.next`` and then waits so the
    new page has time to load.

    Args:
      - browser: A Selenium webdriver with the correct page already open
      - wait_time: time to wait after clicking the link (seconds)

    Returns:
      None

    Raises:
      Whatever ``browser.find_element`` or the element's ``click`` raises,
      e.g. when the page has no next-page link.
    """

    # find_element(by, value) replaces find_element_by_css_selector, which
    # was removed in Selenium 4.3; "css selector" is the locator strategy
    # string and is accepted by old and new Selenium versions alike.
    browser.find_element("css selector", "a.page-link.next").click()

    time.sleep(wait_time)

In [5]:
def scrape_n_items(url, num_items, wait_time=5):
    """Scrapes at least 'num_items' institutions' data from the QS rankings
    webpage.

    Pages are parsed one after another until enough rows were collected. If
    the next page cannot be reached before that, the rows gathered so far
    are returned, so the result can hold fewer than 'num_items' rows.

    Args:
      - url: The URL of the QS ranking page
      - num_items: Minimum number of institutions to scrape
      - wait_time: time to wait after loading the pages (seconds)

    Returns:
      A pandas.DataFrame containing the scraped data
    """

    browser = open_page(url, wait_time=wait_time)

    records = []
    try:
        while len(records) < num_items:
            records.extend(parse_page(browser))

            # Enough rows collected: skip the pointless extra page load.
            if len(records) >= num_items:
                break

            try:
                next_page(browser, wait_time=wait_time)
            except (NoSuchElementException, ElementNotInteractableException):
                # No usable next-page link: keep what was collected instead
                # of discarding it with an exception.
                break
    finally:
        # quit() ends the whole driver session (close() only closes the
        # window), and the finally clause makes sure this happens even when
        # scraping fails midway.
        browser.quit()

    return pd.DataFrame(records)

In [6]:
# Collect at least 100 institutions from the 2022 QS world ranking.
# Running this cell opens a Chrome window and pages through the site.
uni_df = scrape_n_items(
    "https://www.topuniversities.com/university-rankings/world-university-rankings/2022",
    num_items=100,
)

In [7]:
# Sanity check: first five scraped rows.
uni_df.head(n=5)

Unnamed: 0,name,rank,location,score
0,Massachusetts Institute of Technology (MIT),1,"Cambridge,United States",100.0
1,University of Oxford,2,"Oxford,United Kingdom",99.5
2,Stanford University,=3,"Stanford,United States",98.7
3,University of Cambridge,=3,"Cambridge,United Kingdom",98.7
4,Harvard University,5,"Cambridge,United States",98.0


In [8]:
# Sanity check: last five scraped rows.
uni_df.tail(n=5)

Unnamed: 0,name,rank,location,score
95,The University of Sheffield,95,"Sheffield,United Kingdom",61.6
96,Pennsylvania State University,96,"University Park,United States",61.5
97,Sungkyunkwan University(SKKU),97,"Suwon,South Korea",60.5
98,University of Science and Technology of China,98,"Hefei,China (Mainland)",60.1
99,Technical University of Denmark,99,"Kongens Lyngby,Denmark",59.9
