# ADA Homework 2 - Data from the Web

In [103]:
# Import libraries
import requests
import json
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

## 1. 200 QS Top Universities from www.topuniversities.com 

In [104]:
NB_BEST = 200
QS_URL = "https://www.topuniversities.com"
QS_RANK_URL = "https://www.topuniversities.com/university-rankings/world-university-rankings/2018"
QS_DATA_URL = "https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt"

In [105]:
QS_data_req = requests.get(QS_DATA_URL)
QS_data = QS_data_req.json()

In [106]:
def get_QS_number(type_string):
    wrapper = uni_soup.find("div", class_=type_string)
    if wrapper:
        string = wrapper.find("div", class_="number").text[1:]
        return int(re.sub("[^0-9^.]", "", string))
    else:
        return np.nan

In [107]:
QS_ranking = pd.DataFrame(columns = ["nid", "title", "rank_display", "country", "region", "score"])

for i in range(NB_BEST):
    df_i = pd.DataFrame.from_dict(QS_data["data"][i], orient='index').T
    df_i = df_i.drop(["stars", "guide", "cc", "logo", "core_id", "url"], axis=1)
    
    # Get linked site with more information
    uni_url = QS_URL + QS_data["data"][i]["url"]
    uni_request = requests.get(uni_url)
    uni_page_body = uni_request.text
    uni_soup = BeautifulSoup(uni_page_body, 'html.parser')
    
    # Get total number of students
    df_i["total_students"] = get_QS_number("total student")
    # Get number of international students
    df_i["international_students"] = get_QS_number("total inter")
    # Get number of faculty members
    df_i["faculty_members"] = get_QS_number("total faculty")
    # Get number of international faculty members
    df_i["inter_faculty_members"] = get_QS_number("inter faculty")
    
    QS_ranking = QS_ranking.append(df_i)
    
    
QS_ranking = QS_ranking.add_suffix("_QS")
QS_ranking = QS_ranking.rename(columns = {'title_QS':'name'})
QS_ranking = QS_ranking.set_index("name")


# Cannot convert columns to int, because NaN's can only exist in float or object columns
#QS_ranking.total_students = QS_ranking.total_students.astype(int)
#QS_ranking.international_students = QS_ranking.international_students.astype(int)
#QS_ranking.faculty_members = QS_ranking.faculty_members.astype(int)
#QS_ranking.inter_faculty_members = QS_ranking.inter_faculty_members.astype(int)

In [108]:
QS_ranking.head()

Unnamed: 0_level_0,country_QS,faculty_members_QS,inter_faculty_members_QS,international_students_QS,nid_QS,rank_display_QS,region_QS,score_QS,total_students_QS
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Massachusetts Institute of Technology (MIT),United States,2982.0,1679.0,3717.0,294850,1,North America,100.0,11067.0
Stanford University,United States,4285.0,2042.0,3611.0,297282,2,North America,98.7,15878.0
Harvard University,United States,4350.0,1311.0,5266.0,294270,3,North America,98.4,22429.0
California Institute of Technology (Caltech),United States,953.0,350.0,647.0,294562,4,North America,97.7,2255.0
University of Cambridge,United Kingdom,5490.0,2278.0,6699.0,294561,5,Europe,95.6,18770.0


In [109]:
QS_ranking.describe()

Unnamed: 0,faculty_members_QS,inter_faculty_members_QS,international_students_QS,total_students_QS
count,199.0,198.0,199.0,199.0
mean,2912.949749,786.409091,5078.261307,25709.130653
std,2010.741417,666.690652,3924.655224,17135.556845
min,86.0,1.0,47.0,532.0
25%,1636.5,327.0,2552.5,15140.5
50%,2605.0,599.5,4250.0,23775.0
75%,3633.5,1027.75,6880.5,32216.0
max,16421.0,3905.0,27109.0,141939.0


In [110]:
QS_ranking.dtypes

country_QS                    object
faculty_members_QS           float64
inter_faculty_members_QS     float64
international_students_QS    float64
nid_QS                        object
rank_display_QS               object
region_QS                     object
score_QS                      object
total_students_QS            float64
dtype: object

## 2. 200 TH Top Universities from www.timeshighereducation.com

In [111]:
TH_URL = "http://timeshighereducation.com"
TH_RANK_URL = "http://timeshighereducation.com/world-university-rankings/2018/world-ranking"
TH_DATA_URL = "https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json"

In [112]:
TH_data_req = requests.get(TH_DATA_URL)
TH_data = TH_data_req.json()

In [114]:
TH_ranking = pd.DataFrame(columns = ["nid", "name", "rank", "scores_overall", "location"])

for i in range(NB_BEST):
    df_i = pd.DataFrame.from_dict(TH_data["data"][i], orient='index').T
    drop_cols = df_i.columns.difference(["nid", "name", "rank", "scores_overall", "location"])
    df_i = df_i.drop(drop_cols, axis=1)
    
    df_i["total_students"] = int(re.sub("[^0-9^.]", "", TH_data["data"][i]["stats_number_students"]))
    percentage_intl = int(re.sub("[^0-9^.]", "", TH_data["data"][i]["stats_pc_intl_students"]))
    df_i["international_students"] = int(df_i.total_students * percentage_intl / 100)
    
    TH_ranking = TH_ranking.append(df_i)
    
TH_ranking = TH_ranking.add_suffix("_TH")
TH_ranking = TH_ranking.rename(columns = {'name_TH':'name'})
TH_ranking = TH_ranking.set_index("name")
TH_ranking.total_students = TH_ranking.total_students_TH.astype(int)
TH_ranking.international_students = TH_ranking.international_students_TH.astype(int)

In [115]:
TH_ranking.head()

Unnamed: 0_level_0,international_students_TH,location_TH,nid_TH,rank_TH,scores_overall_TH,total_students_TH
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
University of Oxford,7755.0,United Kingdom,468,1,94.3,20409.0
University of Cambridge,6436.0,United Kingdom,470,2,93.2,18389.0
California Institute of Technology,596.0,United States,128779,=3,93.0,2209.0
Stanford University,3485.0,United States,467,=3,93.0,15845.0
Massachusetts Institute of Technology,3800.0,United States,471,5,92.5,11177.0


In [116]:
TH_ranking.describe()

Unnamed: 0,international_students_TH,total_students_TH
count,200.0,200.0
mean,4964.715,24746.295
std,3108.431593,13353.01984
min,36.0,515.0
25%,2789.0,14880.25
50%,4437.5,23130.5
75%,6693.5,32933.25
max,16846.0,69427.0


In [117]:
TH_ranking.dtypes

international_students_TH    float64
location_TH                   object
nid_TH                        object
rank_TH                       object
scores_overall_TH             object
total_students_TH            float64
dtype: object

In [118]:
import difflib

In [122]:
def match_up(x, indexes):
    matches = difflib.get_close_matches(x, indexes)
    if matches:
        return matches[0]
    else:
        return x

QS_ranking.index = QS_ranking.index.map(lambda x: match_up(x, TH_ranking.index))
merged = QS_ranking.join(TH_ranking, how='outer')
merged.head()

Unnamed: 0_level_0,country_QS,faculty_members_QS,inter_faculty_members_QS,international_students_QS,nid_QS,rank_display_QS,region_QS,score_QS,total_students_QS,international_students_TH,location_TH,nid_TH,rank_TH,scores_overall_TH,total_students_TH
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Aalto University,Finland,1257.0,370.0,1831.0,294263.0,=137,Europe,56.0,12147.0,2548.0,Finland,767,190,52.7,12744.0
Aarhus University,Denmark,2316.0,602.0,3762.0,294648.0,119,Europe,59.3,26226.0,3020.0,Denmark,589,=109,59.4,25167.0
Arizona State University,,,,,,,,,,8450.0,United States,591,=126,58.1,44475.0
Australian National University,Australia,1600.0,927.0,5551.0,294616.0,20,Oceania,87.1,14442.0,5595.0,Australia,502,48,71.6,15986.0
Autonomous University of Barcelona,Italy,2990.0,153.0,4195.0,294595.0,=188,Europe,49.1,63399.0,5169.0,Spain,666,=147,56.2,32309.0
