### Context Setting

COMPASS C2 (Qualifications criteria) gives EP applications 20 points for coming from a "top-tier university". However, SEA schools (Malaysia, Indonesia, Vietnam, Philippines, China, India) are under-represented in traditional ranking systems.

Therefore, the team is creating a new ranking system using Citations, Academic Reputation and Employer Reputation as criteria to better identify reputable universities in SEA.

### QS 

#### Setting Up

In [None]:
# install required libraries (only needed the first time you run; comment out after)
# pip install selenium
# pip install pandas

In [None]:
# import libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException

import time
import pandas as pd
import datetime as dt
year = dt.datetime.now().year

# define wait times for webscrape to run
def wait(seconds=3):
    """Pause the script for a short, fixed delay between browser actions.

    Args:
        seconds: How long to sleep. Defaults to 3, the delay that was
            previously hard-coded, so existing ``wait()`` calls are unchanged.
    """
    time.sleep(seconds)
def long_wait(seconds=5):
    """Pause the script for a longer delay, e.g. after a full page load.

    Args:
        seconds: How long to sleep. Defaults to 5, the delay that was
            previously hard-coded, so existing ``long_wait()`` calls are unchanged.
    """
    time.sleep(seconds)

# click on accept cookies button
def close_cookie(driver):
    """Dismiss the cookie-consent banner if it is present (best effort).

    Args:
        driver: The selenium WebDriver showing the page.

    Any selenium failure (banner missing, not clickable, ...) is reported with
    a message and otherwise ignored. Unlike a bare ``except:``, this no longer
    swallows KeyboardInterrupt/SystemExit, so the notebook can still be
    interrupted while this runs.
    """
    # Imported locally: the notebook's import cell only brings in NoSuchElementException.
    from selenium.common.exceptions import WebDriverException

    try:
        close_btn = driver.find_element(By.XPATH, "//div[@class='eu-cookie-compliance-buttons']")
        close_btn.click()
    except WebDriverException:
        print("Cookie button not found")
        return
    # Pause after a successful click before continuing.
    wait()


In [None]:
# setting up chrome driver (website to scrape)
def setup(year, max_fallback_years=5):
    """Open the QS Computer Science subject ranking for ``year`` in Chrome.

    If the page for ``year`` shows "Coming Soon!", the previous year is tried
    instead, for at most ``max_fallback_years`` earlier years.

    Args:
        year: The first ranking year to try.
        max_fallback_years: How many earlier years may be tried before giving
            up. Bounding this prevents endless Chrome relaunches when the
            marker text never disappears.

    Returns:
        A Chrome WebDriver positioned on the first published ranking page.

    Raises:
        RuntimeError: If none of the tried years has a published ranking.
    """
    for candidate_year in range(year, year - max_fallback_years - 1, -1):
        chrome_options = Options()
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')

        prefs = {"download.default_directory": ""}
        chrome_options.add_experimental_option("prefs", prefs)

        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(f"https://www.topuniversities.com/university-rankings/university-subject-rankings/{candidate_year}/computer-science-information-systems?&tab=indicators&sort_by=overallscore&order_by=desc")
            driver.maximize_window()
            long_wait()
            close_cookie(driver)
            # The site shows "Coming Soon!" when this year's ranking is not out yet.
            published = 'Coming Soon!' not in driver.page_source
        except Exception:
            # Do not leave an orphaned browser behind if loading fails.
            driver.quit()
            raise

        if published:
            return driver
        driver.quit()

    raise RuntimeError(
        f"No published QS ranking found for {year} or the {max_fallback_years} years before it"
    )

In [None]:
# button to load more data
def click_load_more(driver):
    """Keep clicking "Load More" until the whole ranking table is loaded.

    The loop stops when the page shows the "No data found on applied filters"
    message, or when no "Load More" button is left.

    Args:
        driver: The selenium WebDriver showing the QS ranking table.
    """
    no_data_xpath = "//div[contains(text(), 'No data found on applied filters')]"
    load_more_xpath = "//button[contains(@class, 'loadmorebutton') and contains(text(), 'Load More')]"

    while True:
        # Pause before checking the page again.
        time.sleep(5)

        # find_elements returns an empty list instead of raising, so no
        # try/except is needed for these presence checks.
        if driver.find_elements(By.XPATH, no_data_xpath):
            break

        buttons = driver.find_elements(By.XPATH, load_more_xpath)
        if not buttons:
            break

        load_more_btn = buttons[0]
        # Scroll to the button and click it through JavaScript.
        driver.execute_script("arguments[0].scrollIntoView();", load_more_btn)
        driver.execute_script("arguments[0].click();", load_more_btn)
        wait()

In [None]:
# button to scroll right on data table to get other criteria
def table_click_right(driver):
    """Click the first right-arrow control on the page, then pause via wait().

    Args:
        driver: The selenium WebDriver showing the QS ranking table.

    Raises:
        IndexError: If the page has no ``span`` with ``direction='right'``.
    """
    arrows = driver.find_elements(By.XPATH, "//span[@direction='right']")
    first_arrow = arrows[0]
    first_arrow.click()
    wait()

In [None]:
# Launch Chrome on the QS ranking page; setup() falls back to earlier years
# when the requested year's page shows "Coming Soon!".
driver = setup(year)

In [None]:
# Expand the table: keep clicking "Load More" until no button is left or the
# "No data found" message appears.
click_load_more(driver)

#### Extracting Data

In [None]:
# Collect every element whose class attribute is exactly 'td-wrap-in'
# (presumably the ranking table cells - confirm against the page markup).
# The cells below index into this flat list with a stride of 8.
data = driver.find_elements(By.XPATH, "//*[@class='td-wrap-in']")
# Debug aid: print each cell with its index to work out the column offsets.
# for i in range(len(data)):
#     print(i, data[i].text)

In [None]:
# Get school names: take every 8th cell of the flat cell list, starting at offset 1
schools_data = [cell.text for cell in data[1::8]]

In [None]:
# Click the table's right arrow to get more criteria before the score cells are read
table_click_right(driver)

In [None]:
# Get Academic Reputation scores: every 8th cell of the flat cell list, starting at offset 3
AR_data = [cell.text for cell in data[3::8]]

In [None]:
# Get Employer Reputation scores: every 8th cell of the flat cell list, starting at offset 4
ER_data = [cell.text for cell in data[4::8]]

In [None]:
# Get Citations scores: every 8th cell of the flat cell list, starting at offset 5
citations_data = [cell.text for cell in data[5::8]]

In [None]:
# Get locations of schools
def extract_location(driver):
    """Return the non-empty, stripped text of every location element on the page.

    Args:
        driver: The selenium WebDriver showing the QS ranking table.

    Returns:
        A list of location strings, one per ``div`` whose class contains
        'location' and whose text is not blank. Blank entries are skipped, so
        the list may be shorter than the number of matching elements.
    """
    elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'location')]")
    stripped_texts = (element.text.strip() for element in elements)
    return [text for text in stripped_texts if text]

# Scrape the location strings; they are split into city and country further down
location_data = extract_location(driver)
# print(location_data)

#### Creating Dataframe

In [None]:
# Separating countries and cities from location data
city_list = []
country_list = []

for location in location_data:
    # Split at the LAST comma so that locations with several commas still
    # yield the final component as the country; everything before it is the city.
    city, comma, country = location.rpartition(',')
    if comma:
        city_list.append(city.strip())
        country_list.append(country.strip())
    else:
        # No comma: keep the whole text as the city and leave the country unknown
        city_list.append(location)
        country_list.append(None)

In [None]:
# Create the QS dataframe with one column per scraped list
df_qs = pd.DataFrame({
    "University": schools_data,
    "QS Citations per Paper": citations_data,
    "QS Academic Reputation": AR_data,
    "QS Employer Reputation": ER_data,
})

In [None]:
# Add the scraped countries first, then normalise the label used for China.
# (The replace has to run after the column exists; doing it before raised a
# KeyError and would have been overwritten by the assignment anyway.)
df_qs['Country'] = country_list
df_qs['Country'] = df_qs['Country'].replace("China", "China (Mainland)")
df_qs = df_qs[['University','Country', "QS Citations per Paper",
         "QS Academic Reputation", "QS Employer Reputation"]]

In [None]:
# Display the QS dataframe as the cell output
df_qs

### THE

#### Setting Up

In [None]:
# Import libraries

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException

import time
import pandas as pd
import pprint
import datetime as dt
year = dt.datetime.now().year

# Define wait times for webscrape to run
def wait(seconds=3):
    """Pause the script for a short, fixed delay between browser actions.

    Args:
        seconds: How long to sleep. Defaults to 3, the delay that was
            previously hard-coded, so existing ``wait()`` calls are unchanged.
    """
    time.sleep(seconds)
def long_wait(seconds=5):
    """Pause the script for a longer delay, e.g. after a full page load.

    Args:
        seconds: How long to sleep. Defaults to 5, the delay that was
            previously hard-coded, so existing ``long_wait()`` calls are unchanged.
    """
    time.sleep(seconds)


In [None]:
# Setting up chrome driver (website to scrape)
def setup(year, max_fallback_years=5):
    """Open the THE Computer Science subject ranking for ``year`` in Chrome.

    If the page for ``year`` shows the site's "couldn't find that page"
    message, the previous year is tried instead, for at most
    ``max_fallback_years`` earlier years.

    Args:
        year: The first ranking year to try.
        max_fallback_years: How many earlier years may be tried before giving
            up. Bounding this prevents endless Chrome relaunches when the
            error text never disappears.

    Returns:
        A Chrome WebDriver positioned on the first ranking page that exists.

    Raises:
        RuntimeError: If none of the tried years has a ranking page.
    """
    for candidate_year in range(year, year - max_fallback_years - 1, -1):
        chrome_options = Options()
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--ignore-ssl-errors')

        prefs = {"download.default_directory": ""}
        chrome_options.add_experimental_option("prefs", prefs)

        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(f"https://www.timeshighereducation.com/world-university-rankings/{candidate_year}/subject-ranking/computer-science#!/length/-1/sort_by/rank/sort_order/asc/cols/stats")
            driver.maximize_window()
            long_wait()
            # The site shows this message when the requested year has no page.
            found = "Sorry, we couldn't find that page. Try searching instead." not in driver.page_source
        except Exception:
            # Do not leave an orphaned browser behind if loading fails.
            driver.quit()
            raise

        if found:
            return driver
        driver.quit()

    raise RuntimeError(
        f"No THE ranking page found for {year} or the {max_fallback_years} years before it"
    )

In [None]:
# Button to click on "scores" data
def table_click_scores(driver):
    """Click the first label whose ``for`` attribute contains 'scores', then pause via wait().

    Args:
        driver: The selenium WebDriver showing the THE ranking table.

    Raises:
        IndexError: If the page has no such label.
    """
    score_labels = driver.find_elements(By.XPATH, "//label[contains(@for, 'scores')]")
    first_label = score_labels[0]
    first_label.click()
    wait()

In [None]:
# Launch Chrome on the THE ranking page; setup() falls back to earlier years
# when the requested year's page does not exist.
driver = setup(year)

In [None]:
# Click the "scores" label before the score cells are scraped below
table_click_scores(driver)

#### Extracting Data

In [None]:
# Get school names from every element whose class contains 'ranking-institution-title'
df_name = driver.find_elements(By.XPATH, "//*[contains(@class, 'ranking-institution-title')]")
name_data = [element.text for element in df_name]

In [None]:
# Get Country
# NOTE(review): "//div/span" matches every <span> that is a direct child of any
# <div> on the page, not only the country labels - confirm that this list lines
# up one-to-one with name_data before the dataframe is built.
df_country = driver.find_elements(By.XPATH, "//div/span")
country_data = [element.text for element in df_country]

In [None]:
# Get Citation scores from the table cells whose class contains 'scores citations-score'
df_citations = driver.find_elements(By.XPATH, "//td[contains(@class, 'scores citations-score')]")
citations_data = [cell.text for cell in df_citations]


In [None]:
# Get Research scores from the table cells whose class contains 'scores research-score'
df_research = driver.find_elements(By.XPATH, "//td[contains(@class, 'scores research-score')]")
research_data = [cell.text for cell in df_research]

In [None]:
# Get Teaching scores from the table cells whose class contains 'scores teaching-score'
df_teaching = driver.find_elements(By.XPATH, "//td[contains(@class, 'scores teaching-score')]")
teaching_data = [cell.text for cell in df_teaching]

#### Creating Dataframe

In [None]:
# Create the THE dataframe with one column per scraped list
df_the = pd.DataFrame({
    "University": name_data,
    "Country": country_data,
    "Citations": citations_data,
    "Research": research_data,
    "Teaching": teaching_data,
})

In [None]:
# Display the THE dataframe as the cell output
df_the

### SH

#### Setting Up

In [None]:
# import libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException

import time
import pandas as pd
import datetime as dt
year = dt.datetime.now().year

# define wait times for webscrape to run
def wait(seconds=3):
    """Pause the script for a short, fixed delay between browser actions.

    Args:
        seconds: How long to sleep. Defaults to 3, the delay that was
            previously hard-coded, so existing ``wait()`` calls are unchanged.
    """
    time.sleep(seconds)
def long_wait(seconds=5):
    """Pause the script for a longer delay, e.g. after a full page load.

    Args:
        seconds: How long to sleep. Defaults to 5, the delay that was
            previously hard-coded, so existing ``long_wait()`` calls are unchanged.
    """
    time.sleep(seconds)

In [None]:
# setting up chrome driver (website to scrape)
def setup(year, max_fallback_years=5):
    """Open the ShanghaiRanking subject ranking (code RS0210) for ``year`` in Chrome.

    If the page for ``year`` reports that it is inaccessible, the previous
    year is tried instead, for at most ``max_fallback_years`` earlier years.

    Args:
        year: The first ranking year to try.
        max_fallback_years: How many earlier years may be tried before giving
            up. Bounding this prevents endless Chrome relaunches when the
            error text never disappears.

    Returns:
        A Chrome WebDriver positioned on the first accessible ranking page.

    Raises:
        RuntimeError: If none of the tried years has an accessible page.
    """
    for candidate_year in range(year, year - max_fallback_years - 1, -1):
        chrome_options = Options()
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')

        prefs = {"download.default_directory": ""}
        chrome_options.add_experimental_option("prefs", prefs)

        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(f"https://www.shanghairanking.com/rankings/gras/{candidate_year}/RS0210")
            driver.maximize_window()
            long_wait()
            # The site shows this message when the requested year is unavailable.
            accessible = "Sorry, the page is inaccessible" not in driver.page_source
        except Exception:
            # Do not leave an orphaned browser behind if loading fails.
            driver.quit()
            raise

        if accessible:
            return driver
        driver.quit()

    raise RuntimeError(
        f"No accessible ShanghaiRanking page found for {year} or the {max_fallback_years} years before it"
    )

In [None]:
# Button to click "CNCI" scores
def click_cnci(driver):
    """Open the control in the table's fifth header cell and choose "CNCI".

    Args:
        driver: The selenium WebDriver showing the ShanghaiRanking table.

    Any exception is caught and reported with a printed message, so a failure
    here does not stop the notebook.
    """
    try:
        # Click the image inside the fifth <th> of the content box
        driver.find_element(By.XPATH, '//*[@id="content-box"]//th[5]//img').click()
        long_wait()
        # Pick the list entry whose text is exactly "CNCI"
        driver.find_element(By.XPATH, '//li[text()="CNCI"]').click()
        wait()
    except Exception as e:
        print("cnci button not found", e)

In [None]:
# Button to click "TOP" scores
def click_top(driver):
    """Open the control in the table's fifth header cell and choose "TOP".

    Args:
        driver: The selenium WebDriver showing the ShanghaiRanking table.

    Any exception is caught and reported with a printed message, so a failure
    here does not stop the notebook.
    """
    try:
        # Click the image inside the fifth <th> of the content box
        driver.find_element(By.XPATH, '//*[@id="content-box"]//th[5]//img').click()
        long_wait()
        # Pick the list entry whose text is exactly "TOP"
        driver.find_element(By.XPATH, '//li[text()="TOP"]').click()
        wait()
    except Exception as e:
        print("top button not found", e)

In [None]:
# Launch Chrome on the ShanghaiRanking page; setup() falls back to earlier
# years when the requested year's page is inaccessible.
driver = setup(year)

#### Extracting Data

In [None]:
# Pagination item titled "1"; clicked before each extraction pass to return to the first page.
# NOTE(review): this element reference is looked up once and reused after the
# table has been paged through - confirm it does not go stale when the page re-renders.
back_to_first_page = driver.find_element(By.XPATH, '//li[@title="1"]')

In [None]:
# Get School names
def extract_names(driver):
    """Collect the university names from every page of the ranking table.

    Starting from the page currently shown, reads all ``span.univ-name`` texts
    in the table rows, then clicks the "next page" item and repeats until that
    item is disabled or missing.

    Args:
        driver: The selenium WebDriver showing the ShanghaiRanking table.

    Returns:
        A list of stripped university-name strings in the order they were read.
    """
    university_names = []
    has_next_page = True

    while has_next_page:
        for table_row in driver.find_elements(By.XPATH, '//tr[@data-v-ae1ab4a8=""]'):
            name_spans = table_row.find_elements(By.XPATH, './/span[@class="univ-name"]')
            university_names.extend(span.text.strip() for span in name_spans)

        try:
            # The pager item's title "下一页" is Chinese for "next page"
            pager_next = driver.find_element(By.XPATH, '//li[@title="下一页"]')
            if pager_next.get_attribute('aria-disabled') == 'true':
                print("Next Page button is not clickable. Stopping loop.")
                has_next_page = False
            else:
                pager_next.click()
                wait()
        except NoSuchElementException as e:
            print("No more pages available.", e)
            has_next_page = False

    return university_names

# Return to page 1, then walk all pages collecting the university names.
# NOTE(review): there is no pause between the click and the first read, unlike
# the CNCI/TOP passes below - confirm the first page has rendered by then.
back_to_first_page.click()
university = extract_names(driver)

In [None]:
# Get CNCI scores
def extract_cnci(driver):
    """Collect the fifth-column value from every page of the ranking table.

    The fifth column is expected to show CNCI because click_cnci() is called
    before this function. Rows with fewer than five cells are skipped.

    Args:
        driver: The selenium WebDriver showing the ShanghaiRanking table.

    Returns:
        A list of stripped cell texts in the order they were read.
    """
    cnci_numbers = []
    has_next_page = True

    while has_next_page:
        for table_row in driver.find_elements(By.XPATH, '//tr[@data-v-ae1ab4a8=""]'):
            cells = table_row.find_elements(By.XPATH, './/td')
            if len(cells) < 5:
                continue
            # Index 4 is the fifth <td> of the row
            cnci_numbers.append(cells[4].text.strip())

        try:
            # The pager item's title "下一页" is Chinese for "next page"
            pager_next = driver.find_element(By.XPATH, '//li[@title="下一页"]')
            if pager_next.get_attribute('aria-disabled') == 'true':
                print("Next Page button is not clickable. Stopping loop.")
                has_next_page = False
            else:
                pager_next.click()
                wait()
        except NoSuchElementException as e:
            print("No more pages available.", e)
            has_next_page = False

    return cnci_numbers


# Return to page 1 and pause
back_to_first_page.click()
time.sleep(12)
# Scroll back to the top of the page before using the column selector
driver.execute_script("window.scrollTo(0, 0);")
time.sleep(3)
# Choose "CNCI" in the fifth column's selector, pause, then read that column on every page
click_cnci(driver)
time.sleep(10)
cnci = extract_cnci(driver)


In [None]:
# Get TOP scores
def extract_top(driver):
    """Collect the fifth-column value from every page of the ranking table.

    The fifth column is expected to show TOP because click_top() is called
    before this function. Rows with fewer than five cells are skipped.

    Args:
        driver: The selenium WebDriver showing the ShanghaiRanking table.

    Returns:
        A list of stripped cell texts in the order they were read.
    """
    top_numbers = []
    has_next_page = True

    while has_next_page:
        for table_row in driver.find_elements(By.XPATH, '//tr[@data-v-ae1ab4a8=""]'):
            cells = table_row.find_elements(By.XPATH, './/td')
            if len(cells) < 5:
                continue
            # Index 4 is the fifth <td> of the row
            top_numbers.append(cells[4].text.strip())

        try:
            # The pager item's title "下一页" is Chinese for "next page"
            pager_next = driver.find_element(By.XPATH, '//li[@title="下一页"]')
            if pager_next.get_attribute('aria-disabled') == 'true':
                print("Next Page button is not clickable. Stopping loop.")
                has_next_page = False
            else:
                pager_next.click()
                wait()
        except NoSuchElementException as e:
            print("No more pages available.", e)
            has_next_page = False

    return top_numbers

# Return to page 1 and pause
back_to_first_page.click()
time.sleep(12)
# Scroll back to the top of the page before using the column selector
driver.execute_script("window.scrollTo(0, 0);")
time.sleep(3)
# Choose "TOP" in the fifth column's selector, pause, then read that column on every page
click_top(driver)
time.sleep(10)
top = extract_top(driver)

#### Creating Dataframe

In [None]:
# Create the ShanghaiRanking dataframe with one column per scraped list
df_shanghai = pd.DataFrame({
    "University": university,
    "CNCI": cnci,
    "TOP": top,
})

In [None]:
# Display the ShanghaiRanking dataframe as the cell output
df_shanghai

### Merging

Merging the three DataFrames built above from the QS, THE and SH (ShanghaiRanking) rankings

In [None]:
!pip install fuzzywuzzy
!pip install python-Levenshtein

In [None]:
# import pandas library
import pandas as pd

In [None]:
# QS uni name cleaning: lower-case, drop any whitespace-prefixed "(...)" part,
# then remove every character that is not a-z, 0-9 or a space.
# Raw strings keep the regex backslashes from being read as (invalid) string escapes.
df_qs['University'] = (
    df_qs['University']
    .str.lower()
    .replace(r'\s\(.*\)', '', regex=True)
    .replace(r'[^a-z0-9 ]', '', regex=True)
)
df_qs

In [None]:
# THE uni name cleaning: lower-case, drop any whitespace-prefixed "(...)" part,
# then remove every character that is not a-z, 0-9 or a space.
# Raw strings keep the regex backslashes from being read as (invalid) string escapes.
df_the['University'] = (
    df_the['University']
    .str.lower()
    .replace(r'\s\(.*\)', '', regex=True)
    .replace(r'[^a-z0-9 ]', '', regex=True)
)
df_the

In [None]:
# SH uni name cleaning: lower-case, drop any whitespace-prefixed "(...)" part,
# then remove every character that is not a-z, 0-9 or a space.
# Raw strings keep the regex backslashes from being read as (invalid) string escapes.
df_shanghai['University'] = (
    df_shanghai['University']
    .str.lower()
    .replace(r'\s\(.*\)', '', regex=True)
    .replace(r'[^a-z0-9 ]', '', regex=True)
)
df_shanghai

In [None]:
from fuzzywuzzy import fuzz, process

# Define a function to find the best match for a given university name
def find_best_match(name, choices, threshold=90):
    """Return the fuzzy-matched entry of ``choices`` for ``name``, or ``name`` itself.

    Args:
        name: The university name to standardise.
        choices: Candidate names to match against.
        threshold: Minimum fuzzywuzzy score for a candidate to be accepted.
            Defaults to 90, the value that was previously hard-coded.

    Returns:
        The best-scoring candidate when its score is at least ``threshold``;
        otherwise ``name`` unchanged. ``name`` is also returned unchanged when
        it is not a string (e.g. a missing value) or when there are no candidates.
    """
    if not isinstance(name, str) or len(choices) == 0:
        return name

    match = process.extractOne(name, choices)
    # extractOne returns None when it finds nothing to return
    if match is None:
        return name

    best_match, score = match[0], match[1]
    return best_match if score >= threshold else name

# Get unique university names from both dataframes
qs_universities = df_qs['University'].unique()
the_universities = df_the['University'].unique()

# The QS names are the reference list that THE names are matched against
all_universities = list(set(qs_universities))

# Standardize university names in df_the
df_the['New University'] = df_the['University'].apply(
    lambda name: find_best_match(name, all_universities)
)

# Merge on the STANDARDISED name: swap it in as the 'University' key.
# (Merging on the raw THE name made the fuzzy matching above have no effect.)
# NOTE(review): 'Country' is also a merge key, so rows only combine when both
# sources spell the country identically - confirm that is intended.
the_for_merge = df_the.drop(columns='University').rename(columns={'New University': 'University'})
merged_df = pd.merge(df_qs, the_for_merge, on=['University', 'Country'], how='outer')

# Select the desired columns
merged_df = merged_df[['University', 'Country', 'QS Citations per Paper', 'QS Academic Reputation','QS Employer Reputation',
                       'Citations', 'Research', 'Teaching']]

# Display the merged dataframe
merged_df

In [None]:
# Get unique university names from both dataframes
shanghai_universities = df_shanghai['University'].unique()
merged_universities = merged_df['University'].unique()

# The already-merged QS/THE names are the reference list that SH names are matched against
all_universities = list(set(merged_universities))

# Standardize university names in df_shanghai
df_shanghai['new University'] = df_shanghai['University'].apply(
    lambda name: find_best_match(name, all_universities)
)

# Merge on the STANDARDISED name: swap it in as the 'University' key.
# (Merging on the raw SH name made the fuzzy matching above have no effect.)
shanghai_for_merge = df_shanghai.drop(columns='University').rename(columns={'new University': 'University'})
new_merged_df = pd.merge(merged_df, shanghai_for_merge, on=['University'], how='outer')

# Select the desired columns
new_merged_df = new_merged_df[['University', 'Country', 'QS Citations per Paper', 'QS Academic Reputation','QS Employer Reputation',
                       'Citations', 'Research', 'Teaching', 'CNCI', 'TOP']]

# Display the merged dataframe
new_merged_df

In [None]:
# Remove rows that are exact duplicates of an earlier row
new_merged_df = new_merged_df.drop_duplicates()
new_merged_df

In [None]:
# Fill missing SCORES with 0; the scoring functions below treat 0 as "no data".
# Only the score columns are filled, so a missing Country stays missing instead
# of becoming the number 0.
score_columns = ['QS Citations per Paper', "Citations", 'CNCI', 'QS Academic Reputation',
                 'QS Employer Reputation', 'Research', 'Teaching', 'TOP']
# Apply the column order (the selection used to be evaluated and then discarded)
new_merged_df = new_merged_df[['University', 'Country'] + score_columns].fillna(
    {column: 0 for column in score_columns}
)
new_merged_df

In [None]:
# Ensure the citation scores are numeric. Text that cannot be parsed is coerced
# to NaN and then set to 0 ("no data"); a NaN left in place would count as a
# present score below and turn the final citation score into NaN.
for column in ['QS Citations per Paper', 'Citations', 'CNCI']:
    new_merged_df[column] = pd.to_numeric(new_merged_df[column], errors='coerce').fillna(0)

# Calculate the "final citation score" for each row based on the specified logic
def calculate_final_score_citations(row):
    """Combine the three citation indicators of one row into a single score.

    The available indicators (non-zero and not NaN) are averaged, scaled by
    0.8, and a bonus is added for coverage: +0.2 when all three sources have a
    value, +0.1 for two, none for one. The result is multiplied by 100.

    Args:
        row: A mapping/Series with numeric 'QS Citations per Paper',
            'Citations' and 'CNCI' entries; 0 or NaN means "no data".

    Returns:
        The final citation score, or 0 when no indicator is available.
    """
    scores = [row['QS Citations per Paper'], row['Citations'], row['CNCI']]
    # NaN compares unequal to 0, so it must be excluded explicitly
    present = [score for score in scores if pd.notna(score) and score != 0]

    if not present:
        return 0

    coverage_bonus = {1: 0.0, 2: 0.1, 3: 0.2}[len(present)]
    final_score = ((sum(present) / len(present)) / 100) * 0.8 + coverage_bonus
    return final_score * 100

# Apply the calculation row by row (axis=1) and store the result in a new 'final citation score' column
new_merged_df['final citation score'] = new_merged_df.apply(calculate_final_score_citations, axis=1)

# Display the updated DataFrame
new_merged_df

In [None]:
#  Calculate the "final reputation score" for each row based on the specified logic
def calculate_final_score_rep(row):
    """Combine the five reputation indicators of one row into a single score.

    Each indicator is converted to a number first; values that are 0 or cannot
    be parsed count as "no data". The available indicators are averaged, scaled
    by 0.8, and a coverage bonus of 0.05 per additional available indicator is
    added (none for one, up to +0.2 for five). The result is multiplied by 100.

    Args:
        row: A pandas Series holding 'TOP', 'QS Academic Reputation',
            'QS Employer Reputation', 'Research' and 'Teaching'.

    Returns:
        The final reputation score, or 0 when no indicator is available.
    """
    cols = ['TOP', 'QS Academic Reputation', 'QS Employer Reputation', 'Research', 'Teaching']

    # Convert before counting: scraped values are text, and any string compares
    # unequal to 0, so unparseable text would otherwise count as a present score.
    values = pd.to_numeric(row[cols], errors='coerce')
    present = values[values.notna() & (values != 0)]
    present_count = len(present)

    if present_count == 0:
        return 0

    coverage_bonus = {1: 0.0, 2: 0.05, 3: 0.1, 4: 0.15, 5: 0.2}[present_count]
    final_score = ((present.sum() / present_count) / 100) * 0.8 + coverage_bonus
    return final_score * 100

# Apply the calculation row by row (axis=1) and store the result in a new 'final reputation score' column
new_merged_df['final reputation score'] = new_merged_df.apply(calculate_final_score_rep, axis=1)

# Display the updated DataFrame
new_merged_df

In [None]:
# Overall score = 20% citation score + 80% reputation score
citation_part = 0.2 * new_merged_df['final citation score']
reputation_part = 0.8 * new_merged_df['final reputation score']
new_merged_df['overall score'] = citation_part + reputation_part

# Rank from highest to lowest overall score and renumber the rows from 0
new_merged_df = new_merged_df.sort_values(by='overall score', ascending=False).reset_index(drop=True)
new_merged_df

In [None]:
# Save the final ranking to 'FINAL_ranking_list.csv' (relative path)
new_merged_df.to_csv('FINAL_ranking_list.csv')