# QS

### Setting Up

In [4]:
# pip install selenium

In [5]:
# import libraries

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException

import time
import pandas as pd
import pprint
import datetime as dt
# Rankings edition to request first: the current calendar year.
year = dt.datetime.now().year

# define wait times for webscrape to run
def wait(seconds=3):
    """Pause the script so the page has time to react.

    Args:
        seconds: How long to sleep, in seconds. Defaults to 3, the value
            that used to be hard-coded.
    """
    time.sleep(seconds)
def long_wait(seconds=5):
    """Pause the script for a longer period, e.g. after a page load.

    Args:
        seconds: How long to sleep, in seconds. Defaults to 5, the value
            that used to be hard-coded.
    """
    time.sleep(seconds)

# click on accept cookies button
def close_cookie(driver):
    """Dismiss the cookie-consent banner if it is present (best effort).

    Any Selenium error raised while locating or clicking the banner is
    reported on stdout and otherwise ignored, so scraping can continue.

    Args:
        driver: Selenium WebDriver whose current page may show the banner.
    """
    # Imported locally: the file-level imports only bring in
    # NoSuchElementException, which does not cover click failures.
    from selenium.common.exceptions import WebDriverException

    try:
        close_btn = driver.find_element(By.XPATH, "//div[@class='eu-cookie-compliance-buttons']")
        close_btn.click()
    except WebDriverException:
        # Only Selenium failures are swallowed; a bare ``except`` would also
        # hide KeyboardInterrupt and genuine programming errors.
        print("Cookie button not found")
    else:
        # Give the page a moment to settle after the banner is dismissed.
        wait()


In [6]:
# setting up chrome driver (website to scrape)
def setup(year, max_lookback=10):
    """Open the QS subject-rankings page for the newest available year.

    Starting at ``year``, the rankings page is loaded in a fresh Chrome
    session. If the page shows the "Coming Soon!" placeholder, that session
    is closed and the previous year is tried instead.

    Args:
        year: First rankings year to try.
        max_lookback: How many earlier years may be tried after ``year``
            before giving up. Bounds what used to be an unlimited recursion.

    Returns:
        A Chrome WebDriver positioned on the first page that does not show
        the "Coming Soon!" placeholder. The caller owns the driver and is
        responsible for quitting it.

    Raises:
        RuntimeError: If every year tried shows the placeholder.
    """
    for candidate_year in range(year, year - max_lookback - 1, -1):
        chrome_options = Options()
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')

        prefs = {"download.default_directory": ""}
        chrome_options.add_experimental_option("prefs", prefs)

        driver = webdriver.Chrome(options=chrome_options)
        keep_open = False
        try:
            driver.get(f"https://www.topuniversities.com/university-rankings/university-subject-rankings/{candidate_year}/computer-science-information-systems?&tab=indicators&sort_by=overallscore&order_by=desc")
            driver.maximize_window()
            long_wait()
            close_cookie(driver)

            # The site serves a "Coming Soon!" page for years whose rankings
            # are not published yet.
            keep_open = 'Coming Soon!' not in driver.page_source
        finally:
            # Close the browser both when falling back to an earlier year and
            # when loading failed, so no Chrome process is left behind.
            if not keep_open:
                driver.quit()

        if keep_open:
            return driver

    raise RuntimeError(
        f"No published rankings found for {year} or the {max_lookback} preceding years"
    )


In [9]:
# button to load more data
def click_load_more(driver):
    """Press the "Load More" button repeatedly until the table is exhausted.

    Each round sleeps five seconds, then stops if the "No data found on
    applied filters" message is on the page. Otherwise the "Load More"
    button is scrolled into view and clicked through JavaScript. The loop
    also stops as soon as the button can no longer be found.

    Args:
        driver: Selenium WebDriver showing the rankings table.
    """
    no_data_xpath = "//div[contains(text(), 'No data found on applied filters')]"
    load_more_xpath = "//button[contains(@class, 'loadmorebutton') and contains(text(), 'Load More')]"

    while True:
        time.sleep(5)

        # Presence of the "no data" message means everything is loaded.
        try:
            driver.find_element(By.XPATH, no_data_xpath)
        except NoSuchElementException:
            message_shown = False
        else:
            message_shown = True
        if message_shown:
            return

        try:
            button = driver.find_element(By.XPATH, load_more_xpath)
            driver.execute_script("arguments[0].scrollIntoView();", button)
            driver.execute_script("arguments[0].click();", button)
            wait()
        except NoSuchElementException:
            # No button left to press: nothing more to load.
            return

# Open the rankings page (setup() falls back to earlier years when the
# requested one is not published) and expand the table until no more rows
# can be loaded.
driver = setup(year)
click_load_more(driver)



WebDriverException: ignored

In [None]:
# button to scroll right on the data table to reveal the other criteria
def table_click_right(driver):
    """Click the first right-arrow control on the page, then pause briefly.

    Args:
        driver: Selenium WebDriver showing the rankings table.

    Raises:
        IndexError: If the page has no element matching the arrow XPath.
    """
    first_arrow = driver.find_elements(By.XPATH, "//span[@direction='right']")[0]
    first_arrow.click()
    wait()


### Extracting Data

In [None]:
# Collect every table cell element by its class name. The result is one flat
# list; the columns below are picked out of it by position.
data = driver.find_elements(By.XPATH, "//*[@class='td-wrap-in']")
# for i in range(len(data)):
#     print(i, data[i].text)

In [None]:
# get school names: every 8th cell of the flat list, starting at index 1
schools_data = [cell.text for cell in data[1::8]]
# schools_data

In [None]:
# scroll the table to the right to reveal more criteria
table_click_right(driver)

In [None]:
# get Academic Reputation scores: every 8th cell, starting at index 3
AR_data = [cell.text for cell in data[3::8]]
# AR_data

In [None]:
# get Employer Reputation scores: every 8th cell, starting at index 4
ER_data = [cell.text for cell in data[4::8]]
# ER_data

In [None]:
# get Citations scores: every 8th cell, starting at index 5
citations_data = [cell.text for cell in data[5::8]]
# citations_data

In [None]:
# get locations of schools
def extract_location(driver):
    """Return the non-empty location strings found on the current page.

    Args:
        driver: Selenium WebDriver showing the rankings table.

    Returns:
        A list with the whitespace-stripped text of every ``div`` whose class
        attribute contains ``location``, in page order, skipping elements
        whose text is empty.
    """
    elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'location')]")
    stripped_texts = (element.text.strip() for element in elements)
    return [text for text in stripped_texts if text]

# Collect the non-empty location strings from the page currently loaded.
location_data = extract_location(driver)
# print(location_data)

### Creating Dataframe

In [None]:
# Start from an empty frame that declares the first three output columns;
# the two reputation columns are added when the frame is populated.
df_columns = ["Country", "University", "QS Citations per Paper"]
df = pd.DataFrame(columns=df_columns)

# NOTE(review): fixed 20-second pause; its purpose is not evident from this
# file — confirm it is still needed.
time.sleep(20)

In [None]:
# populating data into dataframe, one column at a time in a fixed order
column_values = {
    "Country": location_data,
    "University": schools_data,
    "QS Citations per Paper": citations_data,
    "QS Academic Reputation": AR_data,
    "QS Employer Reputation": ER_data,
}
for column_name, values in column_values.items():
    df[column_name] = values

In [None]:
# final dataframe
# df

In [None]:
# Split each "City, Country" location string into its two parts.
city_list = []
country_list = []

for location in location_data:
    # Split on the LAST comma so that a location with several commas
    # ("City, Region, Country") still yields the final component as the
    # country, rather than the middle one.
    city, separator, country = location.rpartition(',')
    if separator:
        city_list.append(city.strip())
        country_list.append(country.strip())
    else:
        # No comma at all: keep the whole string as the city and leave the
        # country unknown.
        city_list.append(location)
        country_list.append(None)
# print(city_list)
# print(country_list)

In [None]:
# Fill the "Country" column with the parsed country names first, then
# rename "China". In the opposite order the rename is overwritten by the
# assignment and never takes effect.
df['Country'] = country_list
df['Country'] = df['Country'].replace("China", "China (Mainland)")

# Keep only the output columns, in their final order.
df = df[['University','Country', "QS Citations per Paper",
         "QS Academic Reputation", "QS Employer Reputation"]]
df_qs = df

In [None]:
# Write the final table to QS.xlsx (relative path). The DataFrame index is
# written as well, because index=False is not passed.
df_qs.to_excel('QS.xlsx')