# 02 - Data from the Web

In [14]:
import pandas as pd
import numpy as np
from requests import get
from bs4 import BeautifulSoup as bs
import json
import re

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

sns.set_context('notebook')
pd.options.mode.chained_assignment = None  # default='warn', Mutes warnings when copying a slice from a DataFrame.import seaborn as sns

## Useful functions

In [196]:
def clean_number(n):
    return int(''.join(filter(lambda c: c.isdigit(), n)))
def select_or_zero(soup, select):
    selection = soup.select(select)
    if len(selection) == 0:
        return np.nan
    return clean_number(selection[0].text)
def normalize_name(name):
    name = str(name)
    name = re.sub('\(.*\)','', name)
    name = re.sub('^The','', name)
    return name.strip()

## Top Universities

In [206]:
def get_stats(directory):
    res = get('https://www.topuniversities.com'+directory)
    soup = bs(res.text, "lxml")
    faculty_total = select_or_zero(soup, '.total .text .number')
    faculty_inter = select_or_zero(soup, '.inter .text .number')
    student_total = select_or_zero(soup, '.student .number')
    student_inter = select_or_zero(soup, '.inter .progress-bar-info .number')
    return faculty_total, faculty_inter, student_total, student_inter

def prepare_uni_data_tu(uni):
    uni_infos = dict()
    uni_infos['Name'] = normalize_name(uni['title'])
    uni_infos['Rank Top Universities'] = clean_number(uni['rank_display'])
    uni_infos['Country'] = uni['country'].strip()
    uni_infos['Region'] = uni['region'].strip()
    uni_infos['Total faculty members'], \
    uni_infos['International faculty members'], \
    uni_infos['Total students'], \
    uni_infos['International students'] = get_stats(uni['url'])
    return uni_infos

res = get('https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt?_=1508338164061', 'html')
json_ranking_tu = json.loads(res.text)['data']
json_ranking_tu_top = sorted(json_ranking_tu, key=lambda k: clean_number(k['rank_display']))[:200]
df_tu = pd.DataFrame(list(map(prepare_uni_data_tu,json_ranking_tu_top)))

In [198]:
df_tu = pd.read_pickle('df_tu.xz')

In [199]:
df_tu['Name'] = df_tu['Name'].apply(normalize_name)
df_tu['Name'].apply(normalize_name)

0                  Massachusetts Institute of Technology
1                                    Stanford University
2                                     Harvard University
3                     California Institute of Technology
4                                University of Cambridge
5                                   University of Oxford
6                                                    UCL
7                                Imperial College London
8                                  University of Chicago
9      ETH Zurich - Swiss Federal Institute of Techno...
10           Nanyang Technological University, Singapore
11              Ecole Polytechnique Fédérale de Lausanne
12                                  Princeton University
13                                    Cornell University
14                      National University of Singapore
15                                       Yale University
16                              Johns Hopkins University
17                             

## Times Higher Education

In [215]:
def complete_with_tu(name):
    global i
    for uni in json_ranking_tu:
        if normalize_name(uni['title']) == name:
            prepare_uni_data_tu(uni)


In [216]:
def prepare_uni_data_the(uni):
    uni_infos = dict()
    uni_infos['Name'] = normalize_name(uni['name'])
    uni_infos['Rank Time Higher Education'] = clean_number(uni['rank'])
    return uni_infos
res = get('https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json', 'html')
json_ranking_the = json.loads(res.text)['data']
json_ranking_the = sorted(json_ranking_the, key=lambda k: clean_number(k['rank']))[:200]
df_the = pd.DataFrame(list(map(prepare_uni_data_the,json_ranking_the)))
df = df_tu.merge(df_the, 'outer', on='Name')
df[df['Country'].isnull()]['Name'].apply(complete_with_tu)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


200    None
201    None
202    None
203    None
204    None
205    None
206    None
207    None
208    None
209    None
210    None
211    None
212    None
213    None
214    None
215    None
216    None
217    None
218    None
219    None
220    None
221    None
222    None
223    None
224    None
225    None
226    None
227    None
228    None
229    None
       ... 
234    None
235    None
236    None
237    None
238    None
239    None
240    None
241    None
242    None
243    None
244    None
245    None
246    None
247    None
248    None
249    None
250    None
251    None
252    None
253    None
254    None
255    None
256    None
257    None
258    None
259    None
260    None
261    None
262    None
263    None
Name: Name, Length: 64, dtype: object