In [None]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [None]:
def get_details(link, timeout=30):
    """Scrape staff and student head-counts from a university details page.

    Parameters
    ----------
    link : str
        URL of the university details page; the '#wurs' fragment is appended
        before the request is sent.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30). Without
        a timeout a single stalled connection would block the whole scrape.

    Returns
    -------
    list
        [total_academic_fac_staff, inter_academic_fac_staff, total_students,
        inter_students]. Each entry is a float, or None when the figure is
        missing from the page or cannot be parsed as a number.
    """
    response = requests.get(link + '#wurs', timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # will contain total_academic_fac_staff, inter_academic_fac_staff, total_students, inter_students
    results = []
    # each entry is the class string of the <div> wrapping one figure
    for name in ['total faculty', 'inter faculty', 'total student', 'total inter']:
        try:
            number_text = soup.find('div', class_=name).find('div', class_='number').text
            # built-in float replaces the np.float alias removed in NumPy 1.24
            elem = float(number_text.replace(',', ''))
        except (AttributeError, ValueError):
            # AttributeError: one of the <div>s is absent (find() returned None)
            # ValueError: the <div> exists but does not hold a parseable number
            elem = None
        results.append(elem)
    return results

In [None]:
# Send a request to get the content of the ranking list.
# The URL of the raw data feed was found with Postman, by checking the different
# requests that the ranking page sends.
r = requests.get('https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051_indicators.txt', timeout=30)
# fail early with a clear HTTP error instead of an obscure JSON decoding error
r.raise_for_status()
page_body = r.text
json_data = r.json()
cols = ['QSrank', 'region', 'country', 'Faculty members (total)', 'Faculty members (inter.)', 'Students (total)', 'Students (inter.)']

# Rows are collected in a dict keyed by university name and turned into a DataFrame
# once at the end: this avoids growing the frame row by row and lets pandas infer
# numeric dtypes for the numeric columns. Re-assigning an existing key overwrites
# the row, exactly like `df.loc[name] = ...` would.
qs_rows = {}
qs_entries = json_data['data'][:200]

# for each university in the list, get basic attributes and try to get the details
for i, uni in enumerate(qs_entries):
    soup = BeautifulSoup(uni['uni'], 'html.parser')
    details_link = 'https://www.topuniversities.com' + soup.a['href']
    name = soup.a.string
    country = uni['location']
    region = uni['region']
    # built-in float replaces the np.float alias removed in NumPy 1.24
    rank = float(uni['overall_rank'])
    # we saw that for some reason the uni ranked 281 appeared in the top-200.
    # this check avoids having it in our ranking (and, being done before the
    # details request, avoids fetching a page whose result would be discarded)
    if int(rank) <= 200:
        details = get_details(details_link)
        qs_rows[name] = [rank, region, country] + details
    print("Progression: " + '{0:.1f}'.format(100*(i+1)/float(len(qs_entries))) + "%", end="\r")

df_QS = pd.DataFrame.from_dict(qs_rows, orient='index', columns=cols)

In [None]:
# Preview the first five rows of the QS table
df_QS.head()

In [None]:
# Ten universities with the highest number of faculty members per student (QS data)
df_QS['Fac members/Students ratio'] = (
    df_QS['Faculty members (total)'] / df_QS['Students (total)']
)
resultfirstratio = df_QS.sort_values(by='Fac members/Students ratio', ascending=False)
resultfirstratio.head(n=10)

In [None]:
# Top 10 universities with the largest international students ratio (QS data)
# (fixed: the numerator previously read `df._QS[...]`, a typo that raised NameError)
df_QS['International students ratio'] = df_QS['Students (inter.)'] / df_QS['Students (total)']
resultsecondratio = df_QS.sort_values('International students ratio', ascending=False)
resultsecondratio.head(10)

In [None]:
# Ratio between faculty members and students per country (QS data):
# head-counts are summed per country first, then divided.
Country_facmembers = df_QS.groupby('country')['Faculty members (total)'].sum()
Country_students = df_QS.groupby('country')['Students (total)'].sum()
Country_facmembers.div(Country_students).sort_values(ascending=False).plot(kind = 'bar')
# label the figure so that it can be read on its own
plt.ylabel('Faculty members per student')
plt.title('QS: ratio between number of faculty members and students per country');

In [None]:
# Ratio of international students per country (QS data)
Country_intstudents = df_QS.groupby('country')['Students (inter.)'].sum()
# Recomputed here because the Times Higher Education section assigns the same
# variable name; relying on the earlier cell would silently mix the two datasets
# if cells are re-run out of order.
Country_students = df_QS.groupby('country')['Students (total)'].sum()
Country_intstudents.div(Country_students).sort_values(ascending=False).plot(kind = 'bar')
plt.ylabel('International students / total students')
plt.title('QS: ratio of international students per country');

In [None]:
# Ratio between faculty members and students per region (QS data):
# head-counts are summed per region first, then divided.
Region_facmembers = df_QS.groupby('region')['Faculty members (total)'].sum()
Region_students = df_QS.groupby('region')['Students (total)'].sum()
Region_facmembers.div(Region_students).sort_values(ascending=False).plot(kind = 'bar')
# label the figure so that it can be read on its own
plt.ylabel('Faculty members per student')
plt.title('QS: ratio between number of faculty members and students per region');

In [None]:
# Ratio of international students per region (QS data)
Region_intstudents = df_QS.groupby('region')['Students (inter.)'].sum()
# recomputed so that this cell does not depend on the previous one having been run
Region_students = df_QS.groupby('region')['Students (total)'].sum()
Region_intstudents.div(Region_students).sort_values(ascending=False).plot(kind = 'bar')
plt.ylabel('International students / total students')
plt.title('QS: ratio of international students per region');

### Times Higher Education


In [None]:
# Request the Times Higher Education ranking data (a JSON document)
r = requests.get('https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json', timeout=30)
# fail early with a clear HTTP error instead of an obscure JSON decoding error later on
r.raise_for_status()
page_body = r.text

# The payload is JSON, so it is not parsed with BeautifulSoup (the HTML parse that
# used to follow produced nothing that was ever read).
# Preview the beginning of the payload; as the last expression of the cell it is
# actually displayed.
page_body[:800]

In [None]:
# Decode the body of the last response `r` as JSON; the loop that builds df_TH reads json_data['data']
json_data = r.json()

In [None]:
cols = ['THrank',  'country', 'Students (total)', 'Students (inter.)','International students ratio', 'Fac members/Students ratio','Faculty members (total)',]

# Rows are collected in a dict keyed by university name and turned into a DataFrame
# once at the end: this avoids growing the frame row by row and lets pandas infer
# numeric dtypes. Re-assigning an existing key overwrites the row, exactly like
# `df.loc[name] = ...` would.
th_rows = {}
th_entries = json_data['data'][:200]

# for each university in the list, parse the figures published in the ranking feed.
# Plain Python floats (float64) are used throughout: np.float16 cannot represent
# values above 65504 and rounds head-counts in the tens of thousands to multiples
# of 16 or more, and the np.float alias was removed in NumPy 1.24.
for i, uni in enumerate(th_entries):
    name = uni['name']
    country = uni['location']
    # tied ranks are written with a leading '=' (e.g. '=25')
    THrank = float(uni['rank'].replace('=',''))
    students_tot = float(uni['stats_number_students'].replace(',',''))
    # percentage string such as '25%' -> fraction 0.25
    student_int_ratio = float(uni['stats_pc_intl_students'].rstrip('%'))/100
    # NOTE: this is the number of students per staff member; it is stored below in
    # the 'Fac members/Students ratio' column, so a smaller value means more staff
    # per student.
    student_staff_ratio = float(uni['stats_student_staff_ratio'])
    # head-counts derived from the published ratios (not rounded)
    staff_number = students_tot/student_staff_ratio
    int_student = students_tot*student_int_ratio

    th_rows[name] = [THrank, country, students_tot, int_student, student_int_ratio, student_staff_ratio, staff_number]
    print("Progression: " + '{0:.1f}'.format(100*(i+1)/float(len(th_entries))) + "%", end="\r")

df_TH = pd.DataFrame.from_dict(th_rows, orient='index', columns=cols)

In [None]:
# Ten universities with the most staff per student (TH data).
# In df_TH the 'Fac members/Students ratio' column is filled with the
# students-per-staff figure, hence the ascending sort: smallest value first.
resultfirstratio = df_TH.sort_values(by='Fac members/Students ratio', ascending=True)
resultfirstratio.head(n=10)

In [None]:
# Ten universities with the largest share of international students (TH data);
# the share is recomputed from the two head-count columns before sorting.
df_TH['International students ratio'] = (
    df_TH['Students (inter.)'] / df_TH['Students (total)']
)
resultsecondratio = df_TH.sort_values(by='International students ratio', ascending=False)
resultsecondratio.head(n=10)

In [None]:
# Faculty members per student, aggregated by country (TH data):
# sum both head-counts per country, divide, and draw the result as a sorted bar chart.
Country_facmembers = df_TH.groupby('country')['Faculty members (total)'].sum()
Country_students = df_TH.groupby('country')['Students (total)'].sum()
plt.figure(figsize=(10, 10))
(Country_facmembers / Country_students).sort_values(ascending=False).plot(kind='bar')
plt.title('Ratio between number of faculty members and students')

In [None]:
# Ratio of international students per country (TH data)
Country_intstudents = df_TH.groupby('country')['Students (inter.)'].sum()
# Recomputed here because the QS section assigns the same variable name; relying
# on an earlier cell would silently mix the two datasets if cells are re-run out
# of order.
Country_students = df_TH.groupby('country')['Students (total)'].sum()
Country_intstudents.div(Country_students).sort_values(ascending=False).plot(kind = 'bar')
plt.title('Ratio of international students per country')

In [None]:
# Make sure THrank is purely numeric: drop the '=' prefix that marks tied ranks.
# The previous loop subscripted each rank (`a[0]`), which raises TypeError because
# the ranks are already stored as floats, and it assigned through chained indexing
# using the rank value as if it were a row label. This vectorised version works for
# both string and numeric ranks and can be re-run safely.
df_TH['THrank'] = df_TH['THrank'].astype(str).str.lstrip('=').astype(float)


In [None]:
# Preview the first rows of the Times Higher Education table
df_TH.head()

### Merging the two datasets


In [None]:
# University names (index labels) of each table, and the names present in both
a = df_QS.index.tolist()
b = df_TH.index.tolist()
ab = set(a) & set(b)

In [None]:
# Merge the two rankings on the university name, which is the index of both frames.
# Without explicit keys, merge() joins on every column name the frames share
# (country, head-counts, ratios): the values never match exactly, so no row would be
# paired and the university names (the index) would be discarded.
# Columns present in both tables get a suffix telling which ranking they come from.
df_merged = df_QS.merge(df_TH, how='outer', left_index=True, right_index=True, suffixes=('_QS', '_TH'))

In [None]:
# Summary statistics of the Times Higher Education table
df_TH.describe()

In [None]:
# Pairwise correlation between the numeric columns of the TH table.
# The text column is dropped and the rest is converted to numbers explicitly:
# recent pandas versions raise when corr() meets a string column, and columns
# stored with object dtype would otherwise be left out of the result.
df_TH.drop(columns=['country']).apply(pd.to_numeric, errors='coerce').corr()

The strongest correlation with the TH rank is with the number of international students. The correlation between the rank and the faculty members/students ratio is positive because this ratio is smaller when there are fewer students per teacher.

In [None]:
# Pairwise correlation between the numeric columns of the QS table.
# The text columns are dropped and the rest is converted to numbers explicitly:
# recent pandas versions raise when corr() meets a string column, and columns
# stored with object dtype would otherwise be left out of the result.
df_QS.drop(columns=['region', 'country']).apply(pd.to_numeric, errors='coerce').corr()

The total number of students has no importance in the QS ranking of the universities. However, the correlation with the ratio of students over faculty members is highly negative. The numbers of faculty members and of international students are highly correlated with the ranking.