In [101]:
# Import libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [125]:
# Source of the ranking: the response body is parsed as JSON and its 'data'
# entry is sliced below.
RANKING_URL = 'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051_indicators.txt'
# Number of leading entries to keep; the code works for any 'top k'.
TOP_K = 200

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into an immediate, readable
# exception instead of a confusing JSON decoding failure.
r = requests.get(RANKING_URL, timeout=30)
r.raise_for_status()

# Note: this slice is the real top TOP_K only if the whole list is sorted by
# rank. That check is done later, after the dataframe has been created.
data = r.json()['data'][:TOP_K]

In [126]:
# Empty frame that the scraping loop fills one row at a time; rows are keyed
# by university name, hence the index is called 'Name'.
df = pd.DataFrame(
    columns=[
        'Rank',
        'Country',
        'Region',
        'Total Faculty Members',
        'International Faculty Members',
        'Total Students',
        'International Students',
    ]
).rename_axis('Name')

def getCount(details, className):
    """Get a single number from a university details webpage.

    Parameters
    ----------
    details : object with a ``text`` attribute
        The fetched details page; ``details.text`` is parsed as HTML.
    className : str
        Class of the ``<div>`` that holds the information to be returned.

    Returns
    -------
    int or None
        The content of the nested ``<div class="number">`` with commas
        removed and converted to ``int``. ``None`` is returned (after
        printing a notice) when the page has no ``className`` block or the
        block contains no number element.
    """
    attr = BeautifulSoup(details.text, 'html.parser').find('div', class_=className)
    if attr is None:
        print ("No attribute '", className, "' found")
        return None
    number = attr.find('div', class_="number")
    if number is None:
        # The wrapper exists but carries no figure: report it the same way as
        # a missing wrapper instead of failing with an AttributeError.
        print("No number found for attribute '", className, "'")
        return None
    return int(number.text.strip().replace(',', ''))

# Dataframe generation: one row per ranking entry, keyed by university name.
# A single Session reuses the connection across the detail-page requests, and
# the timeout prevents one stalled page from blocking the whole loop.
with requests.Session() as session:
    for d in data:
        # d['uni'] is an HTML fragment; its anchor provides both the display
        # name and the relative URL of the details page.
        soup = BeautifulSoup(d['uni'], 'html.parser')
        details = session.get("https://www.topuniversities.com" + soup.a.get('href'),
                              timeout=30)
        index = soup.a.text.strip()
        print("Processing", d['overall_rank'], ":", index)
        df.loc[index] = [int(d['overall_rank']), d['location'], d['region'],
                         getCount(details, 'total faculty'),
                         getCount(details, 'inter faculty'),
                         getCount(details, 'total student'),
                         getCount(details, 'total inter')]

Processing 1 : Massachusetts Institute of Technology (MIT)
Processing 2 : Stanford University
Processing 3 : Harvard University
Processing 4 : California Institute of Technology (Caltech)
Processing 5 : University of Cambridge
Processing 6 : University of Oxford
Processing 7 : UCL (University College London)
Processing 8 : Imperial College London
Processing 9 : University of Chicago
Processing 10 : ETH Zurich - Swiss Federal Institute of Technology
Processing 11 : Nanyang Technological University, Singapore (NTU)
Processing 12 : Ecole Polytechnique Fédérale de Lausanne (EPFL)
Processing 13 : Princeton University
Processing 14 : Cornell University
Processing 15 : National University of Singapore (NUS)
Processing 16 : Yale University
Processing 17 : Johns Hopkins University
Processing 18 : Columbia University
Processing 19 : University of Pennsylvania
Processing 20 : The Australian National University
Processing 21 : Duke University
Processing 21 : University of Michigan
Processing 23 : 

Processing 182 : Universidade Estadual de Campinas (Unicamp)
Processing 182 : University of Colorado Boulder
Processing 182 : Vrije Universiteit Brussel (VUB)
Processing 186 : University of Rochester
Processing 187 : Universidad Autónoma de Madrid
Processing 188 : Alma Mater Studiorum - University of Bologna
Processing 188 : University of Reading
Processing 190 : Indian Institute of Science (IISc) Bangalore
No attribute ' inter faculty ' found
Processing 191 : University of Cape Town
Processing 192 : Keio University CEMS MIM
Processing 192 : Scuola Normale Superiore di Pisa
Processing 192 : Scuola Superiore Sant'Anna Pisa di Studi Universitari e di Perfezionamento
Processing 195 : Stockholm University
Processing 281 : Technische Universität Dresden
Processing 195 : Texas A&M University
Processing 195 : Universitat Autònoma de Barcelona
Processing 199 : Instituto Tecnológico y de Estudios Superiores de Monterrey
Processing 200 : Maastricht University


In [127]:
'Check that the dataframe has the correct length'
# Bare last expression: the notebook displays the number of rows in df.
len(df)

200

In [128]:
'Now we can check for rank correctness'
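# pandas offers Series.is_monotonic_increasing, but we also want to check that
# no rank exceeds the list length and to print the offending rows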
def checkSorted(df, colName):
    """Check that a column is non-decreasing and bounded by the row count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are expected to be in ascending order of ``colName``.
    colName : str
        Name of the column to check.

    Returns
    -------
    bool
        ``True`` when every value is greater than or equal to the previous
        one and no value exceeds ``len(df)``; an empty frame is considered
        correct. Otherwise the first offending row and its successor are
        printed and ``False`` is returned.
    """
    # Work on positions, not labels: the frame may be indexed by strings or
    # by integers that do not match the row positions.
    values = df[colName].to_numpy()
    count = len(values)
    for i in range(count - 1):
        if values[i + 1] < values[i] or values[i] > count:
            print("The list is not correct:")
            print(df.iloc[i], "\n")
            print(df.iloc[i + 1])
            return False
    # The loop never compares the last value against the row count.
    return count == 0 or bool(values[-1] <= count)

# Run the check on the 'Rank' column; the returned boolean is the cell output.
checkSorted(df, 'Rank')

The list is not sorted:
Rank                                 281
Country                          Germany
Region                            Europe
Total Faculty Members               4913
International Faculty Members        669
Total Students                     34029
International Students              4929
Name: Technische Universität Dresden, dtype: object 

Rank                                       195
Country                          United States
Region                           North America
Total Faculty Members                     3446
International Faculty Members              206
Total Students                           60294
International Students                    4900
Name: Texas A&M University, dtype: object


False

In [129]:
''' As of 20/10/2017, the candidate 'top 200' lists Technische Universität Dresden with rank 281.
We suspect this value is wrong: no university has rank 198 (three universities share rank 195 and
the next one has rank 199), while two universities share rank 281, Technische Universität Dresden
being one of them. Rank 282 is filled and all the ranks afterwards are correct. This leads us to
believe that one of the two universities with rank 281 should have rank 198 or 195. Since
Technische Universität Dresden is listed among the universities of rank 195, the simplest way to
restore the ordering of the list is to assign it rank 195.'''

# Manual correction of the rank reported for Technische Universität Dresden.
# Assigning through .loc with a label that is absent from the index would
# silently append a new, mostly-empty row, so only touch an existing row.
dresden_label = 'Technische Universität Dresden'
if dresden_label in df.index:
    df.loc[dresden_label, 'Rank'] = 195
else:
    print("No row named", dresden_label, "- rank left unchanged")
# Re-run the ordering check; its boolean result is the cell output.
checkSorted(df, 'Rank')

True

In [130]:
# Show a preview rather than dumping the whole frame into the notebook.
df.head(10)

Unnamed: 0_level_0,Rank,Country,Region,Total Faculty Members,International Faculty Members,Total Students,International Students
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Massachusetts Institute of Technology (MIT),1,United States,North America,2982,1679,11067,3717
Stanford University,2,United States,North America,4285,2042,15878,3611
Harvard University,3,United States,North America,4350,1311,22429,5266
California Institute of Technology (Caltech),4,United States,North America,953,350,2255,647
University of Cambridge,5,United Kingdom,Europe,5490,2278,18770,6699
University of Oxford,6,United Kingdom,Europe,6750,2964,19720,7353
UCL (University College London),7,United Kingdom,Europe,6345,2554,31080,14854
Imperial College London,8,United Kingdom,Europe,3930,2071,16090,8746
University of Chicago,9,United States,North America,2449,635,13557,3379
ETH Zurich - Swiss Federal Institute of Technology,10,Switzerland,Europe,2477,1886,19815,7563


In [111]:
# Displays whether every index label (university name) occurs only once.
# NOTE(review): this cell's execution count (111) is lower than that of the
# cells above it, so it was run out of order - re-run the notebook top to
# bottom to confirm the result.
df.index.is_unique

True