In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
# from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import pprint
import datetime as dt
year = dt.datetime.now().year
def wait():
    time.sleep(3)
def long_wait():
    time.sleep(5)
def close_cookie(driver):
    try:
        close_btn = driver.find_element(By.XPATH, "//div[@class='eu-cookie-compliance-buttons']")
        close_btn.click()
        wait()
    except:
        print("Cookie button not found")


In [2]:
def setup(year):
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    
    prefs = {"download.default_directory": ""}
    chrome_options.add_experimental_option("prefs", prefs)
    
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(f"https://www.topuniversities.com/university-rankings/university-subject-rankings/{year}/computer-science-information-systems?&tab=indicators&sort_by=overallscore&order_by=desc")
    driver.maximize_window()
    long_wait()
    close_cookie(driver)
    
    # Check if the page contains the "Coming Soon!" error message
    if 'Coming Soon!' in driver.page_source:
        year -= 1
        driver.quit()
        return setup(year)  # Recursively call setup() with the decremented year
    
    return driver


In [3]:
def click_load_more(driver):
    while True:
        try:
            # Check if the "No data found" message is present
            time.sleep(5)
            no_data_msg = driver.find_element(By.XPATH, "//div[contains(text(), 'No data found on applied filters')]")
            break  # Stop clicking the "Load More" button if the message is found
        except NoSuchElementException:
            pass

        try:
            load_more_btn = driver.find_element(By.XPATH, "//button[contains(@class, 'loadmorebutton') and contains(text(), 'Load More')]")
            driver.execute_script("arguments[0].scrollIntoView();", load_more_btn)
            driver.execute_script("arguments[0].click();", load_more_btn)
            wait()
        except NoSuchElementException:
            # If the "Load More" button is not found, break the loop
            break

driver = setup(year)
click_load_more(driver)

In [4]:
def table_click_right(driver):
  right_arrow = driver.find_elements(By.XPATH, "//span[@direction='right']")
  right_arrow[0].click()
  wait()
    
def extract_location(driver):
    locations = driver.find_elements(By.XPATH, "//div[contains(@class, 'location')]")
    location_data = []
    for location in locations:
        location_text = location.text.strip()
        if location_text:
            location_data.append(location_text)
    return location_data



######################################
# setting up dataframe of results #
######################################
df_columns = ["Country", "University", "QS Citations per Paper"]
df = pd.DataFrame(columns=df_columns)
######################################
# running webscrape #
######################################

time.sleep(20)

In [5]:
data = driver.find_elements(By.XPATH, "//*[@class='td-wrap-in']")
for i in range(len(data)):
    print(i, data[i].text)

0 
1 Massachusetts Institute of Technology (MIT)
2 94.5
3 91
4 98.2
5 
6 
7 
8 
9 Carnegie Mellon University
10 93.5
11 100
12 82.3
13 
14 
15 
16 
17 Stanford University
18 93
19 88.5
20 96.3
21 
22 
23 
24 
25 University of California, Berkeley (UCB)
26 89.6
27 85.4
28 90
29 
30 
31 
32 
33 University of Oxford
34 89.4
35 82.1
36 96.7
37 
38 
39 
40 
41 National University of Singapore (NUS)
42 88.5
43 84.4
44 92.4
45 
46 
47 
48 
49 University of Cambridge
50 88.3
51 82.7
52 96
53 
54 
55 
56 
57 Harvard University
58 88.2
59 80.2
60 100
61 
62 
63 
64 
65 ETH Zurich
66 87.2
67 82.8
68 90
69 
70 
71 
72 
73 EPFL
74 86
75 85.3
76 88.3
77 
78 
79 
80 
81 Nanyang Technological University, Singapore (NTU Singapore)
82 85.2
83 80.1
84 87.8
85 
86 
87 
88 
89 University of Toronto
90 84.6
91 82.8
92 84.4
93 
94 
95 
96 
97 Princeton University
98 83.7
99 79.8
100 85.1
101 
102 
103 
104 
105 University of Washington
106 82.9
107 82
108 75.5
109 
110 
111 
112 
113 Tsinghua University
114 

In [6]:
schools_data = list()
for i in range(1, len(data), 8):
    schools_data.append(data[i].text)
schools_data

['Massachusetts Institute of Technology (MIT)',
 'Carnegie Mellon University',
 'Stanford University',
 'University of California, Berkeley (UCB)',
 'University of Oxford',
 'National University of Singapore (NUS)',
 'University of Cambridge',
 'Harvard University',
 'ETH Zurich',
 'EPFL',
 'Nanyang Technological University, Singapore (NTU Singapore)',
 'University of Toronto',
 'Princeton University',
 'University of Washington',
 'Tsinghua University',
 'Tsinghua University',
 'Cornell University',
 'University of California, Los Angeles (UCLA)',
 'Peking University',
 'The University of Edinburgh',
 'University of British Columbia',
 'University of Waterloo',
 'Columbia University',
 'UCL',
 'University of Illinois at Urbana-Champaign',
 'New York University (NYU)',
 'Shanghai Jiao Tong University',
 'Georgia Institute of Technology',
 'Technical University of Munich',
 'KAIST - Korea Advanced Institute of Science & Technology',
 'Institut Polytechnique de Paris',
 'Seoul National U

In [7]:
table_click_right(driver)

In [8]:
AR_data = list()
for i in range(3, len(data), 8):
    AR_data.append(data[i].text)
AR_data

['91',
 '100',
 '88.5',
 '85.4',
 '82.1',
 '84.4',
 '82.7',
 '80.2',
 '82.8',
 '85.3',
 '80.1',
 '82.8',
 '79.8',
 '82',
 '78.8',
 '78.8',
 '76.3',
 '75.8',
 '76.7',
 '79.9',
 '73.4',
 '76',
 '74.2',
 '75.4',
 '74',
 '75.7',
 '71.4',
 '69.6',
 '76.5',
 '75.8',
 '68',
 '72.9',
 '70.1',
 '74.3',
 '67.3',
 '67.2',
 '67.5',
 '66.9',
 '67.6',
 '73.7',
 '64.3',
 '68.3',
 '69.2',
 '69',
 '66',
 '66.2',
 '68.1',
 '68.9',
 '67.9',
 '68.1',
 '70.8',
 '65.7',
 '72.3',
 '64.5',
 '65.3',
 '67.8',
 '63.5',
 '63',
 '67.9',
 '68',
 '68.1',
 '69.6',
 '66.4',
 '69.6',
 '61.1',
 '67.8',
 '65.1',
 '61.6',
 '67.1',
 '68.8',
 '67.7',
 '59.4',
 '60.8',
 '65',
 '62.9',
 '65',
 '61',
 '66.9',
 '64.6',
 '68.2',
 '61.8',
 '62.9',
 '64.4',
 '67.2',
 '58.9',
 '62.4',
 '60.4',
 '61.6',
 '66.8',
 '64.6',
 '59.4',
 '61.5',
 '69.9',
 '61.6',
 '56.7',
 '62.6',
 '62.9',
 '57.7',
 '53.1',
 '56',
 '56.9',
 '63.7',
 '63.8',
 '63.7',
 '61.5',
 '54.8',
 '57.4',
 '58.3',
 '59.1',
 '62.7',
 '63.9',
 '60.6',
 '57.2',
 '66.2',
 

In [9]:
ER_data = list()
for i in range(4, len(data), 8):
    ER_data.append(data[i].text)
ER_data

['98.2',
 '82.3',
 '96.3',
 '90',
 '96.7',
 '92.4',
 '96',
 '100',
 '90',
 '88.3',
 '87.8',
 '84.4',
 '85.1',
 '75.5',
 '83',
 '83',
 '81.5',
 '86.9',
 '80.9',
 '73.5',
 '83.1',
 '79',
 '83.1',
 '76',
 '76.7',
 '77',
 '81.9',
 '78.6',
 '72.9',
 '79.4',
 '86.9',
 '82',
 '81.4',
 '73.7',
 '82.4',
 '86.4',
 '87.1',
 '80.4',
 '78.1',
 '72.9',
 '85.6',
 '78.1',
 '76.4',
 '73.6',
 '75.2',
 '82.7',
 '71.5',
 '78.4',
 '84.7',
 '70.1',
 '71.4',
 '79.8',
 '64.8',
 '78.9',
 '71.4',
 '74.2',
 '76.6',
 '74.9',
 '69.1',
 '79.2',
 '73.4',
 '70.1',
 '74.6',
 '91.9',
 '75.3',
 '85',
 '81.9',
 '75.5',
 '72.6',
 '69.3',
 '69.6',
 '72',
 '75.5',
 '74.5',
 '64.8',
 '74.5',
 '72.2',
 '75.7',
 '78',
 '73.1',
 '65.7',
 '67.8',
 '75.1',
 '57.6',
 '72.1',
 '83.1',
 '80.2',
 '71',
 '61.1',
 '68.2',
 '67.6',
 '73',
 '61.7',
 '75.5',
 '69',
 '63.1',
 '77.5',
 '71.2',
 '86.7',
 '73.5',
 '74.6',
 '78.1',
 '64.6',
 '65.8',
 '75.2',
 '71.6',
 '65.5',
 '69.5',
 '73.2',
 '63.9',
 '61',
 '70.4',
 '63.9',
 '60.5',
 '68.4'

In [10]:
citations_data = list()
for i in range(5, len(data), 8):
    citations_data.append(data[i].text)
citations_data

['93.7',
 '95.3',
 '99.9',
 '100',
 '94.2',
 '89.7',
 '89.4',
 '90.4',
 '91.7',
 '87.6',
 '87.3',
 '91.5',
 '93.8',
 '96.9',
 '86.4',
 '86.4',
 '97.4',
 '86.5',
 '85.9',
 '88.1',
 '88.9',
 '83.4',
 '90.9',
 '90.5',
 '86',
 '92.6',
 '80.5',
 '87.1',
 '81.3',
 '80.1',
 '78',
 '81.9',
 '75.7',
 '89.8',
 '86.6',
 '81.2',
 '84.8',
 '80.6',
 '88',
 '90',
 '75.4',
 '84.4',
 '87.6',
 '89.5',
 '88.5',
 '82.1',
 '92.5',
 '77.9',
 '72.4',
 '87',
 '81',
 '87.4',
 '95.2',
 '78.7',
 '90.2',
 '86.7',
 '83',
 '88.1',
 '87.8',
 '81.3',
 '86.6',
 '80.3',
 '83.2',
 '60.2',
 '82',
 '69.3',
 '74.7',
 '79.1',
 '76.5',
 '78.2',
 '76.7',
 '90.2',
 '80.2',
 '81.1',
 '86.7',
 '81.1',
 '84.3',
 '83.2',
 '66.4',
 '74.4',
 '86.9',
 '75.9',
 '78.1',
 '83.6',
 '88.3',
 '70.2',
 '71',
 '78.1',
 '83.4',
 '74.7',
 '88.7',
 '77.6',
 '83.1',
 '74.7',
 '82.6',
 '85.4',
 '74.4',
 '87.9',
 '72.5',
 '80.3',
 '72.9',
 '69.6',
 '84.7',
 '75',
 '71.2',
 '83.3',
 '87.6',
 '89.6',
 '77.6',
 '87',
 '82.8',
 '77.3',
 '94.6',
 '82.1

In [11]:
location_data = extract_location(driver)
print(location_data)

['Cambridge, United States', 'Pittsburgh, United States', 'Stanford, United States', 'Berkeley, United States', 'Oxford, United Kingdom', 'Singapore, Singapore', 'Cambridge, United Kingdom', 'Cambridge, United States', 'Zürich, Switzerland', 'Lausanne, Switzerland', 'Singapore, Singapore', 'Toronto, Canada', 'Princeton, United States', 'Seattle, United States', 'Beijing, China (Mainland)', 'Beijing, China (Mainland)', 'Ithaca, United States', 'Los Angeles, United States', 'Beijing, China (Mainland)', 'Edinburgh, United Kingdom', 'Vancouver, Canada', 'Waterloo, Canada', 'New York City, United States', 'London, United Kingdom', 'Champaign, United States', 'New York City, United States', 'Shanghai, China (Mainland)', 'Atlanta, United States', 'Munich, Germany', 'Daejeon, South Korea', 'Palaiseau Cedex, France', 'Seoul, South Korea', 'Milan, Italy', 'Amsterdam, Netherlands', 'Montreal, Canada', 'Pasadena, United States', 'New Haven, United States', 'Hangzhou, China (Mainland)', 'Austin, Un

In [12]:
df["Country"] = location_data
df["University"] = schools_data
df["QS Citations per Paper"] = citations_data
df["QS Academic Reputation"] = AR_data
df["QS Employer Reputation"] = ER_data

In [13]:
df

Unnamed: 0,Country,University,QS Citations per Paper,QS Academic Reputation,QS Employer Reputation
0,"Cambridge, United States",Massachusetts Institute of Technology (MIT),93.7,91,98.2
1,"Pittsburgh, United States",Carnegie Mellon University,95.3,100,82.3
2,"Stanford, United States",Stanford University,99.9,88.5,96.3
3,"Berkeley, United States","University of California, Berkeley (UCB)",100,85.4,90
4,"Oxford, United Kingdom",University of Oxford,94.2,82.1,96.7
...,...,...,...,...,...
681,"Porto Alegre, Brazil",Pontifícia Universidade Católica do Rio Grande...,61.2,41.3,52.5
682,"Chania, Greece",Technical University of Crete,65.1,44,45.1
683,"Klagenfurt, Austria",University of Klagenfurt,70.9,46,37.7
684,"Lyon, France",Université Lumière Lyon 2,74.2,42.2,37


In [14]:
city_list = []
country_list = []

for location in location_data:
    # Check if the location contains a comma
    if ',' in location:
        # Split the string at the comma
        split_location = location.split(',')

        # Remove leading and trailing whitespaces from the city and country
        city = split_location[0].strip()
        country = split_location[1].strip()

        city_list.append(city)
        country_list.append(country)
    else:
        city_list.append(location)
        country_list.append(None)
print(city_list)
print(country_list)

['Cambridge', 'Pittsburgh', 'Stanford', 'Berkeley', 'Oxford', 'Singapore', 'Cambridge', 'Cambridge', 'Zürich', 'Lausanne', 'Singapore', 'Toronto', 'Princeton', 'Seattle', 'Beijing', 'Beijing', 'Ithaca', 'Los Angeles', 'Beijing', 'Edinburgh', 'Vancouver', 'Waterloo', 'New York City', 'London', 'Champaign', 'New York City', 'Shanghai', 'Atlanta', 'Munich', 'Daejeon', 'Palaiseau Cedex', 'Seoul', 'Milan', 'Amsterdam', 'Montreal', 'Pasadena', 'New Haven', 'Hangzhou', 'Austin', 'Hong Kong', 'Gif-sur-Yvette', 'Parkville', 'Philadelphia', 'San Diego', 'Ann Arbor', 'Paris', 'Montreal', 'Paris', 'Tokyo', 'Jeddah', 'Leuven', 'Chicago', 'Hong Kong SAR', 'Delft', 'Los Angeles', 'Canberra', 'Sydney', 'Sydney', 'College Park', 'Shanghai', 'Hong Kong', 'Stockholm', 'London', 'Moscow', 'Manchester', 'Mumbai', 'New Delhi', 'West Lafayette', 'Eindhoven', 'Berlin', 'Rome', 'Baltimore', 'Melbourne', 'Seoul', 'Haymarket', 'Seoul', 'Coventry', 'Pohang', 'St. Petersburg', 'Taipei', 'Kuala Lumpur', 'Barcelona'

In [15]:
# Replace the values in the "Country" column
df['Country'] = df['Country'].replace("China", "China (Mainland)")
df['Country'] = country_list
df = df[['University','Country', "QS Citations per Paper", 
         "QS Academic Reputation", "QS Employer Reputation"]]
df_qs = df

In [16]:
df_qs.to_excel('QS.xlsx')

THE

In [17]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException

import time
import pandas as pd
import numpy as np
import pprint

def wait():
    time.sleep(3)

def long_wait():
    time.sleep(5)

# Set up chromedriver
def setup(year):
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument('--ignore-ssl-errors')

    prefs = {"download.default_directory": ""}
    chrome_options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(f"https://www.timeshighereducation.com/world-university-rankings/{year}/subject-ranking/computer-science#!/length/-1/sort_by/rank/sort_order/asc/cols/stats")
    driver.maximize_window()
    long_wait()

    # Check if the page displays the error message
    if "Sorry, we couldn't find that page. Try searching instead." in driver.page_source:
        year -= 1
        driver.quit()
        return setup(year)  # Recursively call setup() with the decremented year

    return driver

# Click on "Scores" button
def table_click_scores(driver):
  scores = driver.find_elements(By.XPATH, "//label[contains(@for, 'scores')]")
  scores[0].click()
  wait()

# Launch chromedriver
driver = setup(year)

# Show scores menu
table_click_scores(driver)


# extract function not even needed
def extract(df, driver):
  table_data = driver.find_element(By.XPATH, "//div[contains(@class, 'pane-content')]")
  table_rows = table_data.find_elements(By.XPATH, "//tr[contains(@role, 'row')]")
  for i in range(2, len(table_rows) - 1, 2):
    row = table_rows[i]
    data = row.find_element(By.XPATH, "/")
    for d in data:
      print(d.text)

######################################
# setting up dataframe of results #
######################################
df_columns = ["University", "Country", "Citations","Research", "Teaching"]
df = pd.DataFrame(columns=df_columns)
######################################
# running webscrape #
######################################

time.sleep(20)

# Scraping Rank, University Name & Country
df_rank = driver.find_elements(By.XPATH, "//td[contains(@class, 'rank sorting_1 sorting_2')]")
rank_data = list()
for i in range(len(df_rank)):
    rank_data.append(df_rank[i].text)


df_name = driver.find_elements(By.XPATH, "//*[contains(@class, 'ranking-institution-title')]")
name_data = list()
for i in range(len(df_name)):
    name_data.append(df_name[i].text)

df_country = driver.find_elements(By.XPATH, "//div/span")
country_data = list()
for i in range(len(df_country)):
    country_data.append(df_country[i].text)

# # Scraping all the Scores
# df_overall = driver.find_elements(By.XPATH, "//td[contains(@class, 'scores overall-score')]")
# overall_data = list()
# for i in range(len(df_overall)):
#     overall_data.append(df_overall[i].text)

df_citations = driver.find_elements(By.XPATH, "//td[contains(@class, 'scores citations-score')]")
citations_data = list()
for i in range(len(df_citations)):
    citations_data.append(df_citations[i].text)

# df_industry_income = driver.find_elements(By.XPATH, "//td[contains(@class, 'scores industry_income-score')]")
# industry_income_data = list()
# for i in range(len(df_industry_income)):
#     industry_income_data.append(df_industry_income[i].text)

# df_international_outlook = driver.find_elements(By.XPATH, "//td[contains(@class, 'scores international_outlook-score')]")
# international_outlook_data = list()
# for i in range(len(df_international_outlook)):
#     international_outlook_data.append(df_international_outlook[i].text)

df_research = driver.find_elements(By.XPATH, "//td[contains(@class, 'scores research-score')]")
research_data = list()
for i in range(len(df_research)):
    research_data.append(df_research[i].text)

df_teaching = driver.find_elements(By.XPATH, "//td[contains(@class, 'scores teaching-score')]")
teaching_data = list()
for i in range(len(df_teaching)):
    teaching_data.append(df_teaching[i].text)

# Adding data to Dataframe
# df["Rank"] = rank_data
df["University"] = name_data
df["Country"] = country_data
# df["Overall"] = overall_data
df["Citations"] = citations_data
# df["Industry Income"] = industry_income_data
# df["International Outlook"] = international_outlook_data
df["Research"] = research_data
df["Teaching"] = teaching_data

print(df)
df_the = df
# Exporting Dataframe as excel sheet
df_the.to_excel('THE_scrape.xlsx')

                                University         Country Citations Research  \
0                     University of Oxford  United Kingdom      99.9     92.8   
1    Massachusetts Institute of Technology   United States      99.0     97.1   
2                      Stanford University   United States     100.0     96.3   
3                               ETH Zurich     Switzerland      98.8     95.7   
4               Carnegie Mellon University   United States      99.6     93.4   
..                                     ...             ...       ...      ...   
969                   Yamaguchi University           Japan      12.1      8.8   
970                     Yanshan University           China      20.2      8.9   
971           Yokohama National University           Japan      20.6     19.3   
972                     Yuan Ze University          Taiwan      28.3     10.7   
973                   University of Žilina        Slovakia      25.0     10.9   

    Teaching  
0       92.4

SH

In [18]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
import time
import pandas as pd
import pprint
import itertools

def wait():
    time.sleep(3)
def long_wait():
    time.sleep(5)

        
def click_cnci(driver):
    try:
        button = driver.find_element(By.XPATH, '//*[@id="content-box"]//th[5]//img')
        button.click()
        long_wait()
        cnci = driver.find_element(By.XPATH, '//li[text()="CNCI"]')
        cnci.click()
        wait()
    except Exception as e:
        print("cnci button not found", e)

def click_top(driver):
    try:
        button = driver.find_element(By.XPATH, '//*[@id="content-box"]//th[5]//img')
        button.click()
        long_wait()
        top = driver.find_element(By.XPATH, '//li[text()="TOP"]')
        top.click()
        wait()
    except Exception as e:
        print("top button not found", e)
        
# def click_award(driver):
#     try:
#         button = driver.find_element(By.XPATH, '//*[@id="content-box"]//th[5]//img')
#         button.click()
#         long_wait()
#         top = driver.find_element(By.XPATH, '//li[text()="AWARD"]')
#         top.click()
#         wait()
#     except Exception as e:
#         print("award button not found", e) 

In [19]:
def setup(year):
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    
    prefs = {"download.default_directory": ""}
    chrome_options.add_experimental_option("prefs", prefs)
    
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(f"https://www.shanghairanking.com/rankings/gras/{year}/RS0210")
    driver.maximize_window()
    long_wait()
    
    # Check if the page is inaccessible
    if "Sorry, the page is inaccessible" in driver.page_source:
        year -= 1
        driver.quit()
        return setup(year)  # Recursively call setup() with the decremented year
    
    return driver

df_columns = ["University", "CNCI", "TOP"]
df = pd.DataFrame(columns=df_columns)
driver = setup(year)


In [20]:
def extract_names(driver):
    university_names = []
    while True:
        rows = driver.find_elements(By.XPATH, '//tr[@data-v-ae1ab4a8=""]')
        for row in rows:
            university_name_elements = row.find_elements(By.XPATH, './/span[@class="univ-name"]')
            for element in university_name_elements:
                university_name = element.text.strip()
                university_names.append(university_name)  # Append the university name to the list

        try:
            next_page_button = driver.find_element(By.XPATH, '//li[@title="下一页"]')
            if next_page_button.get_attribute('aria-disabled') == 'true':
                print("Next Page button is not clickable. Stopping loop.")
                break
            next_page_button.click()
            wait()
        except NoSuchElementException as e:
            print("No more pages available.", e)
            break
    
    return university_names
back_to_first_page = driver.find_element(By.XPATH, '//li[@title="1"]')
back_to_first_page.click()
university = extract_names(driver)
print(university)

Next Page button is not clickable. Stopping loop.
['Massachusetts Institute of Technology (MIT)', 'Stanford University', 'University of California, Berkeley', 'Carnegie Mellon University', 'Tsinghua University', 'Harvard University', 'ETH Zurich', 'Nanyang Technological University', 'University of Oxford', 'Princeton University', 'Columbia University', 'University of Electronic Science and Technology of China', 'National University of Singapore', 'University of California, Los Angeles', 'University of Toronto', 'Shanghai Jiao Tong University', 'University of Technology Sydney', 'Zhejiang University', 'Peking University', 'The Chinese University of Hong Kong', 'Huazhong University of Science and Technology', 'Cornell University', 'Beihang University', 'Harbin Institute of Technology', 'University of Southern California', 'Georgia Institute of Technology', 'Xidian University', 'New York University', 'University of Sydney', 'University of Oslo', 'The University of Edinburgh', 'University 

In [21]:
def extract_cnci(driver):
    cnci_numbers = []
    while True:
        rows = driver.find_elements(By.XPATH, '//tr[@data-v-ae1ab4a8=""]')
        for row in rows:
            td_elements = row.find_elements(By.XPATH, './/td')
            if len(td_elements) >= 5:
                cnci_element = td_elements[4]  # Select the fifth td element (which is the cnci number)
                cnci_number = cnci_element.text.strip()
                cnci_numbers.append(cnci_number)  # Append the CNCI number to the list

        try:
            next_page_button = driver.find_element(By.XPATH, '//li[@title="下一页"]')
            if next_page_button.get_attribute('aria-disabled') == 'true':
                print("Next Page button is not clickable. Stopping loop.")
                break
            next_page_button.click()
            wait()
        except NoSuchElementException as e:
            print("No more pages available.", e)
            break

    return cnci_numbers
time.sleep(10)
back_to_first_page = driver.find_element(By.XPATH, '//li[@title="1"]')
time.sleep(10)
back_to_first_page.click()
time.sleep(12)
driver.execute_script("window.scrollTo(0, 0);")
time.sleep(3)
click_cnci(driver)
time.sleep(10)
cnci = extract_cnci(driver)
print(cnci)


Next Page button is not clickable. Stopping loop.
['81.1', '87.2', '83.5', '73.7', '74.7', '81.6', '79.9', '85.5', '72.2', '86.2', '87.5', '86.1', '87.1', '67.5', '66.6', '71.0', '90.1', '71.0', '76.6', '89.8', '82.2', '73.4', '72.4', '80.5', '75.3', '79.5', '73.6', '79.1', '90.6', '100.0', '72.1', '75.0', '83.0', '82.5', '69.6', '84.3', '81.4', '82.6', '77.8', '77.9', '76.5', '77.4', '78.8', '84.7', '71.3', '78.5', '84.2', '77.9', '67.9', '95.9', '81.6', '75.9', '88.4', '93.3', '93.9', '76.3', '74.4', '62.2', '72.7', '69.8', '77.9', '77.5', '72.0', '75.4', '79.0', '75.4', '82.5', '70.9', '84.5', '73.5', '70.1', '73.7', '74.3', '60.0', '74.9', '76.9', '82.4', '100.0', '89.9', '87.0', '59.1', '85.1', '75.4', '86.1', '66.3', '74.6', '81.5', '87.1', '100.0', '100.0', '82.0', '79.4', '81.8', '74.5', '63.6', '90.2', '74.2', '85.3', '81.6', '76.1', '67.6', '95.3', '97.9', '88.3', '70.1', '69.6', '73.9', '66.9', '74.0', '79.8', '79.2', '80.6', '72.8', '71.2', '76.9', '70.4', '66.0', '72.2', '

In [22]:
def extract_top(driver):
    top_numbers = []
    while True:
        rows = driver.find_elements(By.XPATH, '//tr[@data-v-ae1ab4a8=""]')
        for row in rows:
            td_elements = row.find_elements(By.XPATH, './/td')
            if len(td_elements) >= 5:
                top_element = td_elements[4]  # Select the fifth td element
                top_number = top_element.text.strip()
                top_numbers.append(top_number)  # Append the top number to the list

        try:
            next_page_button = driver.find_element(By.XPATH, '//li[@title="下一页"]')
            if next_page_button.get_attribute('aria-disabled') == 'true':
                print("Next Page button is not clickable. Stopping loop.")
                break
            next_page_button.click()
            wait()
        except NoSuchElementException as e:
            print("No more pages available.", e)
            break

    return top_numbers
time.sleep(10)
back_to_first_page = driver.find_element(By.XPATH, '//li[@title="1"]')
time.sleep(10)
back_to_first_page.click()
time.sleep(12)
driver.execute_script("window.scrollTo(0, 0);")
time.sleep(3)
click_top(driver)
time.sleep(10)
top = extract_top(driver)
print(top)

Next Page button is not clickable. Stopping loop.
['100.0', '85.5', '82.9', '99.8', '81.3', '54.0', '72.8', '54.4', '63.9', '57.8', '50.3', '34.8', '56.0', '53.8', '57.2', '52.4', '47.0', '50.6', '63.7', '55.4', '29.6', '61.9', '42.8', '31.8', '53.7', '66.9', '32.3', '48.1', '44.5', '12.1', '41.2', '50.1', '33.6', '31.1', '56.9', '49.5', '67.3', '46.9', '40.7', '21.9', '33.6', '31.3', '26.7', '27.0', '28.8', '27.2', '46.4', '28.4', '60.5', '33.2', '34.0', '30.5', '8.9', '42.5', '4.0', '44.3', '27.3', '29.7', '12.7', '41.1', '21.9', '27.6', '50.2', '44.0', '32.4', '33.4', '36.0', '53.4', '44.0', '29.3', '61.7', '48.3', '43.0', '32.9', '29.7', '36.4', '17.6', '16.2', '13.3', '6.1', '50.8', '26.8', '33.2', '12.7', '43.4', '46.0', '14.7', '28.0', '12.1', '18.4', '37.3', '32.2', '30.0', '19.4', '29.7', '14.7', '48.5', '21.3', '31.0', '25.3', '16.0', '27.7', '7.2', '9.4', '11.7', '28.0', '44.8', '34.1', '18.0', '9.7', '28.4', '20.8', '32.6', '34.7', '29.4', '32.8', '19.0', '12.1', '18.2', '3

In [23]:
# Append the data to the DataFrame
df["University"] = university
df["CNCI"] = cnci
df["TOP"] = top
df_shanghai = df

# Print the DataFrame
print(df_shanghai)
df_shanghai.to_excel('shanghai.xlsx', index=False)

                                      University  CNCI    TOP
0    Massachusetts Institute of Technology (MIT)  81.1  100.0
1                            Stanford University  87.2   85.5
2             University of California, Berkeley  83.5   82.9
3                     Carnegie Mellon University  73.7   99.8
4                            Tsinghua University  74.7   81.3
..                                           ...   ...    ...
495             Virginia Commonwealth University  72.9    7.6
496             Vrije Universiteit Brussel (VUB)  68.1   11.0
497             Wageningen University & Research  81.1    4.0
498                   Zhejiang Normal University  83.0    6.9
499                         Zhengzhou University  68.5    7.9

[500 rows x 3 columns]


Merging

In [24]:
!pip install fuzzywuzzy
!pip install python-Levenshtein



In [41]:
df_qs['University'] = df_qs['University'].str.strip()
df_the['University'] = df_the['University'].str.strip()
df_shanghai['University'] = df_shanghai['University'].str.strip()
# Convert university names to lowercase and remove special characters
df_qs['University'] = df_qs['University'].str.lower().replace('[^a-z0-9 ]', '', regex=True)
df_the['University'] = df_the['University'].str.lower().replace('[^a-z0-9 ]', '', regex=True)
# Convert university names to lowercase and remove special characters
df_shanghai['University'] = df_shanghai['University'].str.lower().replace('[^a-z0-9 ]', '', regex=True)

In [42]:
from fuzzywuzzy import fuzz, process
import pandas as pd

# Define a function to find the best match for a given university name
def find_best_match(name, choices):
    best_match, score = process.extractOne(name, choices)
    if score >= 95:  # Adjust the threshold as needed
        return best_match
    else:
        return name

# Get unique university names from both dataframes
qs_universities = df_qs['University'].unique()
the_universities = df_the['University'].unique()

# Create a master list of all unique university names
all_universities = list(set(qs_universities))


# Standardize university names in df_the
df_the['New University'] = df_the.apply(lambda x: find_best_match(x['University'], all_universities), axis=1)

# Merge the dataframes based on standardized university names
merged_df = pd.merge(df_qs, df_the, on=['University', 'Country'], how='outer')

# Select the desired columns
merged_df = merged_df[['University', 'Country', 'QS Citations per Paper', 'QS Academic Reputation','QS Employer Reputation',
                       'Citations', 'Research', 'Teaching']]

# Display the merged dataframe
print(merged_df)

                                     University         Country  \
0     massachusetts institute of technology mit   United States   
1                    carnegie mellon university   United States   
2                           stanford university   United States   
3         university of california berkeley ucb   United States   
4                          university of oxford  United Kingdom   
...                                         ...             ...   
1359                       yamaguchi university           Japan   
1360                         yanshan university           China   
1361               yokohama national university           Japan   
1362                         yuan ze university          Taiwan   
1363                        university of ilina        Slovakia   

     QS Citations per Paper QS Academic Reputation QS Employer Reputation  \
0                      93.7                     91                   98.2   
1                      95.3              

In [43]:
# from fuzzywuzzy import fuzz, process
# import pandas as pd

# Assuming you have dataframes named df_qs and df_the

# Define a function to find the best match for a given university name
def find_best_match(name, choices):
    best_match, score = process.extractOne(name, choices)
    if score >= 95:  # Adjust the threshold as needed
        return best_match
    else:
        return name

# Get unique university names from both dataframes
shanghai_universities = df_shanghai['University'].unique()
merged_universities = merged_df['University'].unique()

# Create a master list of all unique university names
all_universities = list(set(merged_universities))

# Standardize university names in df_the
df_shanghai['new University'] = df_shanghai.apply(lambda x: find_best_match(x['University'], all_universities), axis=1)

# Merge the dataframes based on standardized university names
new_merged_df = pd.merge(merged_df, df_shanghai, on=['University'], how='outer')

# Select the desired columns
new_merged_df = new_merged_df[['University', 'Country', 'QS Citations per Paper', 'QS Academic Reputation','QS Employer Reputation',
                       'Citations', 'Research', 'Teaching', 'CNCI', 'TOP']]

# Display the merged dataframe
print(new_merged_df)


                                             University         Country  \
0             massachusetts institute of technology mit   United States   
1                            carnegie mellon university   United States   
2                                   stanford university   United States   
3                 university of california berkeley ucb   United States   
4                                  university of oxford  United Kingdom   
...                                                 ...             ...   
1505                   university of missouri  columbia             NaN   
1506                                university of paris             NaN   
1507  university of shanghai for science and technology             NaN   
1508                 university of tennessee  knoxville             NaN   
1509                uoc universitat oberta de catalunya             NaN   

     QS Citations per Paper QS Academic Reputation QS Employer Reputation  \
0                     

In [44]:
# from fuzzywuzzy import fuzz, process
# import pandas as pd

# Assuming you have a dataframe named merged_df
# Define a function to find the best match for a given university name
def find_best_match(name, choices):
    best_match, score = process.extractOne(name, choices)
    if score >= 93:  # Adjust the threshold as needed
        return best_match
    else:
        return name

# Create a list of all unique university names
all_universities = new_merged_df['University'].unique()

# Standardize university names in the merged DataFrame
new_merged_df['University'] = new_merged_df.apply(lambda x: find_best_match(x['University'], all_universities), axis=1)

# Combine values for similar university names
for idx, row in new_merged_df.iterrows():
    university_name = row['University']
    
    # Find similar matches in the merged DataFrame based on university name (excluding the current row)
    similarity_threshold = 97  # Adjust the similarity score threshold as needed
    similar_matches = new_merged_df[new_merged_df.apply(lambda x: fuzz.token_sort_ratio(x['University'], university_name), axis=1) >= similarity_threshold]
    similar_matches = similar_matches[similar_matches.index != idx]
    
    # Check if there are any similar matches
    if not similar_matches.empty:
        # Get the first similar match (you can implement a more sophisticated logic here)
        similar_row = similar_matches.iloc[0]
        
        # Iterate through the columns to fill in missing values with non-null values from either row
        for column in new_merged_df.columns:
            if pd.isnull(row[column]) and not pd.isnull(similar_row[column]):
                new_merged_df.at[idx, column] = similar_row[column]
            elif pd.isnull(similar_row[column]) and not pd.isnull(row[column]):
                new_merged_df.at[idx, column] = row[column]

# Display the updated DataFrame
print(new_merged_df)

                                             University         Country  \
0             massachusetts institute of technology mit   United States   
1                            carnegie mellon university   United States   
2                                   stanford university   United States   
3                 university of california berkeley ucb   United States   
4                                  university of oxford  United Kingdom   
...                                                 ...             ...   
1505                   university of missouri  columbia   United States   
1506                                university of paris             NaN   
1507  university of shanghai for science and technology             NaN   
1508                 university of tennessee  knoxville             NaN   
1509                uoc universitat oberta de catalunya             NaN   

     QS Citations per Paper QS Academic Reputation QS Employer Reputation  \
0                     

In [45]:
# Fill NaN values with 0 for every row except 'country' column
country_column = new_merged_df['Country']
new_merged_df.drop('Country', axis=1, inplace=True)
new_merged_df.fillna(0, inplace=True)
new_merged_df['Country'] = country_column
new_merged_df[['University', 'Country', 'QS Citations per Paper', "Citations", 'CNCI', 'QS Academic Reputation',
              'QS Employer Reputation', 'Research', 'Teaching', 'TOP']]
new_merged_df

Unnamed: 0,University,QS Citations per Paper,QS Academic Reputation,QS Employer Reputation,Citations,Research,Teaching,CNCI,TOP,Country
0,massachusetts institute of technology mit,93.7,91,98.2,0,0,0,81.1,100.0,United States
1,carnegie mellon university,95.3,100,82.3,99.6,93.4,87.7,73.7,99.8,United States
2,stanford university,99.9,88.5,96.3,100.0,96.3,90.6,87.2,85.5,United States
3,university of california berkeley ucb,100,85.4,90,0,0,0,0,0,United States
4,university of oxford,94.2,82.1,96.7,99.9,92.8,92.4,72.2,63.9,United Kingdom
...,...,...,...,...,...,...,...,...,...,...
1505,university of missouri columbia,75.8,39.2,53.5,0,0,0,73.8,10.5,United States
1506,university of paris,0,0,0,0,0,0,56.7,23.1,
1507,university of shanghai for science and technology,0,0,0,0,0,0,71.1,6.9,
1508,university of tennessee knoxville,0,0,0,0,0,0,63.4,14.5,


In [51]:
import pandas as pd
new_merged_df['QS Citations per Paper'] = pd.to_numeric(new_merged_df['QS Citations per Paper'], errors='coerce')
new_merged_df['Citations'] = pd.to_numeric(new_merged_df['Citations'], errors='coerce')
new_merged_df['CNCI'] = pd.to_numeric(new_merged_df['CNCI'], errors='coerce')

# Calculate the "final citation score" for each row based on the specified logic
def calculate_final_score_citations(row):
    qs_citations_per_paper = row['QS Citations per Paper']
    citations = row['Citations']
    cnci = row['CNCI']

    if qs_citations_per_paper != 0 and citations != 0 and cnci != 0:
        final_score = (((qs_citations_per_paper + citations + cnci) / 3) / 100) * 0.8 + 0.2
    elif (qs_citations_per_paper != 0 and citations != 0) or (qs_citations_per_paper != 0 and cnci != 0) or (citations != 0 and cnci != 0):
        final_score = (((qs_citations_per_paper + citations + cnci) / 2) / 100) * 0.8 + 0.1
    elif qs_citations_per_paper != 0 or citations != 0 or cnci != 0:
        final_score = (max(qs_citations_per_paper, citations, cnci) / 100) * 0.8
    else:
        final_score = 0

    return (final_score*100)

# Apply the calculation to each row and create a new 'final citation score' column
new_merged_df['final citation score'] = new_merged_df.apply(calculate_final_score_citations, axis=1)

# Display the updated DataFrame
print(new_merged_df)


                                             University  \
0             massachusetts institute of technology mit   
1                            carnegie mellon university   
2                                   stanford university   
3                 university of california berkeley ucb   
4                                  university of oxford   
...                                                 ...   
1505                   university of missouri  columbia   
1506                                university of paris   
1507  university of shanghai for science and technology   
1508                 university of tennessee  knoxville   
1509                uoc universitat oberta de catalunya   

      QS Citations per Paper QS Academic Reputation QS Employer Reputation  \
0                       93.7                     91                   98.2   
1                       95.3                    100                   82.3   
2                       99.9                   88.5      

In [None]:
def calculate_final_score_rep(row):
    cols = ['TOP', 'QS Academic Reputation', 'QS Employer Reputation', 'Research', 'Teaching']
    non_zero_cols = [col for col in cols if row[col] != 0]
    non_zero_count = len(non_zero_cols)

    # Convert relevant columns to numeric
    numeric_cols = row[non_zero_cols].apply(pd.to_numeric, errors='coerce')

    if non_zero_count == 5:
        final_score = (((numeric_cols.sum() / 5) / 100) * 0.8) + 0.2
    elif non_zero_count == 4:
        final_score = (((numeric_cols.sum() / 4) / 100) * 0.8) + 0.15
    elif non_zero_count == 3:
        final_score = (((numeric_cols.sum() / 3) / 100) * 0.8) + 0.1
    elif non_zero_count == 2:
        final_score = (((numeric_cols.sum() / 2) / 100) * 0.8) + 0.05
    elif non_zero_count == 1:
        final_score = (numeric_cols[non_zero_cols[0]] / 100) * 0.8
    else:
        final_score = 0

    return (final_score * 100)

# Apply the calculation to each row and create a new 'final reputation score' column
new_merged_df['final reputation score'] = new_merged_df.apply(calculate_final_score_rep, axis=1)

# Display the updated DataFrame
print(new_merged_df)

In [54]:
new_merged_df['overall score'] = 0.2 * new_merged_df['final citation score'] + 0.8 * new_merged_df['final reputation score']
new_merged_df.sort_values(by='overall score', ascending=False, inplace=True)
new_merged_df.reset_index(drop=True, inplace=True)
new_merged_df.index += 1
new_merged_df

In [1]:
new_merged_df.to_csv('FINAL_ranking_list.csv')

NameError: name 'new_merged_df' is not defined

In [None]:
# Combined