In [8]:
# Import libraries
import requests, json
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn')

In [9]:
def import_json(url):
    r = requests.get(url)
    universityJson = json.loads(r.text)
    top200 = pd.DataFrame(universityJson['data'])[:200] #import the top200 universities
    return top200

def import_details_qs(top200): #requires special handling as international stud/fac data not available
    Intl_Data = pd.DataFrame()
    for index, row in top200.iterrows():
        nid = row['nid']
        Intl_link = 'https://www.topuniversities.com/node/'+str(nid)
        rIntl = requests.get(Intl_link)
        
        if rIntl.ok:
            page_body = rIntl.text
            soup = BeautifulSoup(page_body, 'html.parser')
            #Total n - faculty
            try :
                t_faculty = soup.find_all('div', class_='total faculty')[0].find('div', class_='number').text
                tot_fac_value = int(t_faculty[1:-1].replace(',',''))
            except:
                tot_fac_value = 0
            #Total n - international faculty
            try:
                t_interfaculty = soup.find_all('div',class_='inter faculty')[0].find('div',class_='number').text
                tot_intfac_value = int(t_interfaculty[1:-1].replace(',',''))
            except:
                tot_intfac_value = 0
            #Total n - students
            try:
                t_stud = soup.find_all('div', class_='total student')[0].find('div', class_='number').text
                tot_stud_value = int(t_stud[1:-1].replace(',',''))
            except:
                tot_stud_value = 0
            #Total n - international students
            try:
                t_intstud = soup.find_all('div', class_='total inter')[0].find('div', class_='number').text
                tot_intstud_value = int(t_intstud[1:-1].replace(',',''))
            except:
                tot_intstud_value = 0
            IntlDF = pd.DataFrame({'nid' : [nid],'total_faculty_members' : [tot_fac_value], 'international_faculty_members' : [tot_intfac_value], 'total_students' : [tot_stud_value], 'international_students' : [tot_intstud_value]})
            Intl_Data = Intl_Data.append(IntlDF, ignore_index=True)
    return Intl_Data

def import_details_times(top200):
    # Get the supplementary data from the university page
    locationData = pd.DataFrame()
    for index, row in top200.iterrows():
        nid = row['nid']
        #comparisonLink = 'https://www.timeshighereducation.com/sites/default/files/university/comparison/'+str(nid)+'.json'
        #rankingsLink = 'https://www.timeshighereducation.com/sites/default/files/university/rankings/'+str(nid)+'.json'
        locationLink = 'https://www.timeshighereducation.com/sites/default/files/institution_markers/gmap_'+str(nid)+'.json'
        rLocation = requests.get(locationLink)
        if rLocation.ok:
            locationJson = json.loads(rLocation.text)
            locationDF = pd.DataFrame(locationJson)
            locationData = locationData.append(locationDF, ignore_index=True)
    # Add location data to top200 dataframe
       #locationData = locationData.rename(columns={'title':'name'})
    #top200 = pd.merge(top200, locationData, on='name')
    return locationData

## QS International Rankings

In [10]:
%%time
top200_qs = import_json('https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt?_=1508492427994')
data_qs = import_details_qs(top200_qs)

CPU times: user 59 s, sys: 1.08 s, total: 1min
Wall time: 1min 46s


In [21]:
top200_qs_full = pd.merge(top200, data_qs, on='nid')
top200_qs_full.rename(columns={'title':'name'})
top200_qs_full.head()

Unnamed: 0,cc,core_id,country,guide,logo,nid,rank_display,region,score,stars,title,url,international_faculty_members,international_students,total_faculty_members,total_students
0,US,410,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294850,1,North America,100.0,6,Massachusetts Institute of Technology (MIT),/universities/massachusetts-institute-technolo...,1679,3717,2982,11067
1,US,573,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",297282,2,North America,98.7,5,Stanford University,/universities/stanford-university,2042,3611,4285,15878
2,US,253,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294270,3,North America,98.4,5,Harvard University,/universities/harvard-university,1311,5266,4350,22429
3,US,94,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294562,4,North America,97.7,5,California Institute of Technology (Caltech),/universities/california-institute-technology-...,350,647,953,2255
4,GB,95,United Kingdom,"<a href=""/where-to-study/europe/united-kingdom...","<img src=""https://www.topuniversities.com/site...",294561,5,Europe,95.6,5,University of Cambridge,/universities/university-cambridge,2278,6699,5490,18770


## Times Higher Education International Rankings

In [18]:
%%time
top200_times = import_json('https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json')
locdata_times = import_details_times(top200_times)

CPU times: user 5.86 s, sys: 342 ms, total: 6.2 s
Wall time: 41.5 s


In [23]:
locdata_times = locdata_times.rename(columns={'title':'name'})
top200_times_full = pd.merge(top200_times, locdata_times, on='name')

In [24]:
top200_times_full.head()

Unnamed: 0,aliases,location,member_level,name,nid,rank,rank_order,record_type,scores_citations,scores_citations_rank,...,stats_student_staff_ratio,subjects_offered,url,country,lat,lng,locality,path,postal_code,thoroughfare
0,University of Oxford,United Kingdom,0,University of Oxford,468,1,10,master_account,99.1,15,...,11.2,"Archaeology,Art, Performing Arts & Design,Biol...",/world-university-rankings/university-oxford,United Kingdom,51.7577018738,-1.2639590502,Oxford,node/468,OX1 2JD,University Offices
1,University of Cambridge,United Kingdom,0,University of Cambridge,470,2,20,master_account,97.5,29,...,10.9,"Archaeology,Architecture,Art, Performing Arts ...",/world-university-rankings/university-cambridge,United Kingdom,52.2049598694,0.1160930023,Cambridge,node/470,CB2 1TN,The Old Schools
2,California Institute of Technology caltech,United States,0,California Institute of Technology,128779,=3,30,private,99.5,10,...,6.5,"Architecture,Biological Sciences,Business & Ma...",/world-university-rankings/california-institut...,United States,34.1359519959,-118.1256332397,Pasadena,node/128779,91125,1200 East California Boulevard
3,Stanford University,United States,11,Stanford University,467,=3,40,private,99.9,4,...,7.5,"Archaeology,Architecture,Art, Performing Arts ...",/world-university-rankings/stanford-university,United States,37.4276580811,-122.1700668335,Stanford,node/467,94305–2004,450 Serra Mall
4,Massachusetts Institute of Technology,United States,0,Massachusetts Institute of Technology,471,5,50,private,100.0,1,...,8.7,"Architecture,Art, Performing Arts & Design,Bio...",/world-university-rankings/massachusetts-insti...,United States,42.360091,-71.09416,Cambridge,node/471,02139-4307,77 Massachusetts Avenue


## Merging both datasets by name

In [45]:
top200_qs_full = top200_qs_full.rename(columns={'title':'name'})
top200_qs_full.head()

Unnamed: 0,cc,core_id,country,guide,logo,nid,rank_display,region,score,stars,name,url,international_faculty_members,international_students,total_faculty_members,total_students
0,US,410,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294850,1,North America,100.0,6,Massachusetts Institute of Technology (MIT),/universities/massachusetts-institute-technolo...,1679,3717,2982,11067
1,US,573,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",297282,2,North America,98.7,5,Stanford University,/universities/stanford-university,2042,3611,4285,15878
2,US,253,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294270,3,North America,98.4,5,Harvard University,/universities/harvard-university,1311,5266,4350,22429
3,US,94,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294562,4,North America,97.7,5,California Institute of Technology (Caltech),/universities/california-institute-technology-...,350,647,953,2255
4,GB,95,United Kingdom,"<a href=""/where-to-study/europe/united-kingdom...","<img src=""https://www.topuniversities.com/site...",294561,5,Europe,95.6,5,University of Cambridge,/universities/university-cambridge,2278,6699,5490,18770


In [46]:
top200_times_full.head()

Unnamed: 0,aliases,location,member_level,name,nid,rank,rank_order,record_type,scores_citations,scores_citations_rank,scores_industry_income,scores_industry_income_rank,scores_international_outlook,scores_international_outlook_rank,scores_overall,scores_overall_rank,scores_research,scores_research_rank,scores_teaching,scores_teaching_rank,stats_female_male_ratio,stats_number_students,stats_pc_intl_students,stats_student_staff_ratio,subjects_offered,url,country,lat,lng,locality,path,postal_code,thoroughfare
0,University of Oxford,United Kingdom,0,University of Oxford,468,1,10,master_account,99.1,15,63.7,169,95.0,24,94.3,10,99.5,1,86.7,5,46 : 54,20409,38%,11.2,"Archaeology,Art, Performing Arts & Design,Biol...",/world-university-rankings/university-oxford,United Kingdom,51.7577018738,-1.2639590502,Oxford,node/468,OX1 2JD,University Offices
1,University of Cambridge,United Kingdom,0,University of Cambridge,470,2,20,master_account,97.5,29,51.5,260,93.0,35,93.2,20,97.8,3,87.8,3,45 : 55,18389,35%,10.9,"Archaeology,Architecture,Art, Performing Arts ...",/world-university-rankings/university-cambridge,United Kingdom,52.2049598694,0.1160930023,Cambridge,node/470,CB2 1TN,The Old Schools
2,California Institute of Technology caltech,United States,0,California Institute of Technology,128779,=3,30,private,99.5,10,92.6,51,59.7,322,93.0,30,97.5,4,90.3,1,31 : 69,2209,27%,6.5,"Architecture,Biological Sciences,Business & Ma...",/world-university-rankings/california-institut...,United States,34.1359519959,-118.1256332397,Pasadena,node/128779,91125,1200 East California Boulevard
3,Stanford University,United States,11,Stanford University,467,=3,40,private,99.9,4,60.5,189,77.6,162,93.0,40,96.7,5,89.1,2,42 : 58,15845,22%,7.5,"Archaeology,Architecture,Art, Performing Arts ...",/world-university-rankings/stanford-university,United States,37.4276580811,-122.1700668335,Stanford,node/467,94305–2004,450 Serra Mall
4,Massachusetts Institute of Technology,United States,0,Massachusetts Institute of Technology,471,5,50,private,100.0,1,88.4,63,87.6,81,92.5,50,91.9,9,87.3,4,37 : 63,11177,34%,8.7,"Architecture,Art, Performing Arts & Design,Bio...",/world-university-rankings/massachusetts-insti...,United States,42.360091,-71.09416,Cambridge,node/471,02139-4307,77 Massachusetts Avenue


In [43]:
top200_merged = pd.merge(top200_qs_full, top200_times_full, on='name', how ='outer')

In [47]:
pd.set_option('display.max_columns', 500)
top200_merged.head()

Unnamed: 0,cc,core_id,country_x,guide,logo,nid_x,rank_display,region,score,stars,name,url_x,international_faculty_members,international_students,total_faculty_members,total_students,aliases,location,member_level,nid_y,rank,rank_order,record_type,scores_citations,scores_citations_rank,scores_industry_income,scores_industry_income_rank,scores_international_outlook,scores_international_outlook_rank,scores_overall,scores_overall_rank,scores_research,scores_research_rank,scores_teaching,scores_teaching_rank,stats_female_male_ratio,stats_number_students,stats_pc_intl_students,stats_student_staff_ratio,subjects_offered,url_y,country_y,lat,lng,locality,path,postal_code,thoroughfare
0,US,410,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294850,1,North America,100.0,6,Massachusetts Institute of Technology (MIT),/universities/massachusetts-institute-technolo...,1679.0,3717.0,2982.0,11067.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,US,573,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",297282,2,North America,98.7,5,Stanford University,/universities/stanford-university,2042.0,3611.0,4285.0,15878.0,Stanford University,United States,11.0,467.0,=3,40.0,private,99.9,4.0,60.5,189.0,77.6,162.0,93.0,40.0,96.7,5.0,89.1,2.0,42 : 58,15845.0,22%,7.5,"Archaeology,Architecture,Art, Performing Arts ...",/world-university-rankings/stanford-university,United States,37.4276580811,-122.1700668335,Stanford,node/467,94305–2004,450 Serra Mall
2,US,253,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294270,3,North America,98.4,5,Harvard University,/universities/harvard-university,1311.0,5266.0,4350.0,22429.0,Harvard University,United States,0.0,466.0,6,60.0,private,99.7,8.0,46.4,330.0,79.7,143.0,91.8,60.0,98.4,2.0,84.2,9.0,,20326.0,26%,8.9,"Agriculture & Forestry,Archaeology,Art, Perfor...",/world-university-rankings/harvard-university,United States,42.3745993,-71.1204804,Cambridge,node/466,02138,Massachusetts Hall
3,US,94,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294562,4,North America,97.7,5,California Institute of Technology (Caltech),/universities/california-institute-technology-...,350.0,647.0,953.0,2255.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,GB,95,United Kingdom,"<a href=""/where-to-study/europe/united-kingdom...","<img src=""https://www.topuniversities.com/site...",294561,5,Europe,95.6,5,University of Cambridge,/universities/university-cambridge,2278.0,6699.0,5490.0,18770.0,University of Cambridge,United Kingdom,0.0,470.0,2,20.0,master_account,97.5,29.0,51.5,260.0,93.0,35.0,93.2,20.0,97.8,3.0,87.8,3.0,45 : 55,18389.0,35%,10.9,"Archaeology,Architecture,Art, Performing Arts ...",/world-university-rankings/university-cambridge,United Kingdom,52.2049598694,0.1160930023,Cambridge,node/470,CB2 1TN,The Old Schools
