In [None]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def get_details(link, timeout=30):
    """Scrape staff and student head-counts from a university details page.

    Parameters
    ----------
    link : str
        URL of the university details page where the detail numbers are shown.
    timeout : float or tuple, optional
        Passed to ``requests.get`` as its ``timeout`` argument. Defaults to 30.

    Returns
    -------
    list
        ``[total_academic_fac_staff, inter_academic_fac_staff, total_students,
        inter_students]``. Each entry is an ``int``, or ``None`` when the figure
        is absent from the page or its text is not a plain integer.

    Raises
    ------
    Whatever ``requests.get`` raises (connection errors, timeouts); these are
    not caught here.
    """
    link = link + '#wurs'
    # A timeout keeps a single unresponsive page from stalling the whole scrape.
    r = requests.get(link, timeout=timeout)
    page_body = r.text

    soup = BeautifulSoup(page_body, 'html.parser')

    # will contain total_academic_fac_staff, inter_academic_fac_staff, total_students, inter_students
    results = []
    for name in ['total faculty', 'inter faculty', 'total student', 'total inter']:
        try:
            number_text = soup.find('div', class_=name).find('div', class_='number').text
            elem = int(number_text.replace(',', ''))
        except (AttributeError, ValueError):
            # AttributeError: one of the find() calls returned None (block missing).
            # ValueError: the block exists but its text is not an integer.
            elem = None
        results.append(elem)
    return results

In [None]:
# Download the ranking list and build one table row per top-200 university.
# We found the correct url where to send the request using postman and checking
# the different requests that were sent.
RANKING_URL = 'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051_indicators.txt'
SITE_ROOT = 'https://www.topuniversities.com'
TOP_N = 200

r = requests.get(RANKING_URL, timeout=30)
# Stop here with the HTTP status if the list could not be downloaded,
# instead of failing later while decoding an error page as JSON.
r.raise_for_status()
page_body = r.text
json_data = r.json()
cols = ['QSrank', 'region', 'country', 'Faculty members (total)', 'Faculty members (inter.)', 'Students (total)', 'Students (inter.)']

top_entries = json_data['data'][:TOP_N]

# University name -> row values. Assigning to an existing key overwrites the row
# in place, which is what the former `df.loc[name] = ...` did for repeated names.
rows = {}

# For each university in the list, get basic attributes and try to get the details.
for i, uni in enumerate(top_entries, start=1):
    rank = uni['overall_rank']
    # We saw that for some reason the uni ranked 281 appeared in the top-200.
    # Checking the rank before anything else keeps it out of our ranking and
    # avoids downloading a details page that would be thrown away.
    if int(rank) <= TOP_N:
        soup = BeautifulSoup(uni['uni'], 'html.parser')
        details_link = SITE_ROOT + soup.a['href']
        # get_text() yields a plain str even when the anchor wraps nested tags,
        # where `.string` would be None.
        name = soup.a.get_text()
        rows[name] = [rank, uni['region'], uni['location']] + get_details(details_link)
    # Progress is relative to the number of entries actually received.
    print("Progression: " + '{0:.1f}'.format(100 * i / len(top_entries)) + "%", end="\r")

# Build the frame once from all rows rather than growing it row by row.
df = pd.DataFrame.from_dict(rows, orient='index', columns=cols)

In [None]:
# Preview the first five rows of the scraped table.
# To list the universities whose international faculty count is missing instead:
#   df[pd.isnull(df['Faculty members (inter.)'])]
df.head()

In [None]:
# Ten universities with the largest faculty members / students ratio.
fac_ratio_col = 'Fac members/Students ratio'
df[fac_ratio_col] = df['Faculty members (total)'].div(df['Students (total)'])
resultfirstratio = df.sort_values(by=fac_ratio_col, ascending=False)
resultfirstratio.head(n=10)

In [None]:
# Ten universities with the largest share of international students.
intl_ratio_col = 'International students ratio'
df[intl_ratio_col] = df['Students (inter.)'].div(df['Students (total)'])
resultsecondratio = df.sort_values(by=intl_ratio_col, ascending=False)
resultsecondratio.head(n=10)

In [None]:
# Ratio between faculty members and students, aggregated per country.

# Per-country totals over every university. These names and values are unchanged
# so that other cells reading them keep seeing the same numbers.
Country_facmembers = df.groupby('country')['Faculty members (total)'].sum()
Country_students = df.groupby('country')['Students (total)'].sum()

# sum() skips missing values column by column, so dividing the two totals above
# would mix faculty and student counts coming from different sets of universities.
# For the ratio, only universities with both figures known are counted.
country_fac_known = df.dropna(subset=['Faculty members (total)', 'Students (total)'])
country_fac_ratio = (
    country_fac_known.groupby('country')['Faculty members (total)'].sum()
    .div(country_fac_known.groupby('country')['Students (total)'].sum())
    .astype(float)
    .sort_values(ascending=False)
)

ax = country_fac_ratio.plot(kind='bar')
ax.set_title('Faculty members per student, by country')
ax.set_xlabel('Country')
ax.set_ylabel('Faculty members / students')
plt.show()

In [None]:
# Ratio of international students, aggregated per country.

# Per-country total over every university (name and value unchanged).
Country_intstudents = df.groupby('country')['Students (inter.)'].sum()

# sum() skips missing values column by column, so a plain total/total division
# can mix counts from different sets of universities. For the ratio, only
# universities with both student figures known are counted. Computing both
# sums here also removes the dependency on a variable set in another cell.
country_intl_known = df.dropna(subset=['Students (inter.)', 'Students (total)'])
country_intl_ratio = (
    country_intl_known.groupby('country')['Students (inter.)'].sum()
    .div(country_intl_known.groupby('country')['Students (total)'].sum())
    .astype(float)
    .sort_values(ascending=False)
)

ax = country_intl_ratio.plot(kind='bar')
ax.set_title('Share of international students, by country')
ax.set_xlabel('Country')
ax.set_ylabel('International students / students')
plt.show()

In [None]:
# Ratio between faculty members and students, aggregated per region.

# Per-region totals over every university. These names and values are unchanged
# so that other cells reading them keep seeing the same numbers.
Region_facmembers = df.groupby('region')['Faculty members (total)'].sum()
Region_students = df.groupby('region')['Students (total)'].sum()

# sum() skips missing values column by column, so dividing the two totals above
# would mix faculty and student counts coming from different sets of universities.
# For the ratio, only universities with both figures known are counted.
region_fac_known = df.dropna(subset=['Faculty members (total)', 'Students (total)'])
region_fac_ratio = (
    region_fac_known.groupby('region')['Faculty members (total)'].sum()
    .div(region_fac_known.groupby('region')['Students (total)'].sum())
    .astype(float)
    .sort_values(ascending=False)
)

ax = region_fac_ratio.plot(kind='bar')
ax.set_title('Faculty members per student, by region')
ax.set_xlabel('Region')
ax.set_ylabel('Faculty members / students')
plt.show()

In [None]:
# Ratio of international students, aggregated per region.

# Per-region total over every university (name and value unchanged).
Region_intstudents = df.groupby('region')['Students (inter.)'].sum()

# sum() skips missing values column by column, so a plain total/total division
# can mix counts from different sets of universities. For the ratio, only
# universities with both student figures known are counted. Computing both
# sums here also removes the dependency on a variable set in another cell.
region_intl_known = df.dropna(subset=['Students (inter.)', 'Students (total)'])
region_intl_ratio = (
    region_intl_known.groupby('region')['Students (inter.)'].sum()
    .div(region_intl_known.groupby('region')['Students (total)'].sum())
    .astype(float)
    .sort_values(ascending=False)
)

ax = region_intl_ratio.plot(kind='bar')
ax.set_title('Share of international students, by region')
ax.set_xlabel('Region')
ax.set_ylabel('International students / students')
plt.show()