# Scraping and displaying international information

## Gather the QS ranking data

In [None]:
import pandas as pd

In [None]:
# Here using code from stackoverflow user 10987432/paul-m, adapted to scrape all the component data from QS 2022
# How do we extract different years? Not sure yet...

In [None]:
def get_entries(url=None, timeout=30):
    """Yield one plain-text record per institution from the QS rankings feed.

    The feed is JSON whose ``data`` list holds one entry per institution;
    several of the fields are HTML snippets, which are reduced to their
    visible text here.

    Args:
        url: Address of the indicators feed. Defaults to the feed used for
            the QS 2022 rankings. Another feed can be passed, provided it
            exposes the same keys (``uni``, ``overall``, ``ind_18``, ...).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Yields:
        dict with the keys ``name``, ``region``, ``location``, ``city``,
        ``rank``, ``overallscore``, ``internationalfacultyratio``,
        ``academicreputation``, ``citations``, ``facultystudentratio``,
        ``employerrep`` and ``internationalstudentratio``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    import requests
    from bs4 import BeautifulSoup

    if url is None:
        url = "https://www.topuniversities.com/sites/default/files/qs-rankings-data/en/3740566_indicators.txt?1637817445?v=1637823042256"

    headers = {
        "user-agent": "Mozilla/5.0",
        "x-requested-with": "XMLHttpRequest"
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Output column -> key of the HTML-wrapped score in the feed entry.
    # The order here is the column order of the resulting records.
    score_fields = {
        "overallscore": "overall",
        "internationalfacultyratio": "ind_18",
        "academicreputation": "ind_76",
        "citations": "ind_73",
        "facultystudentratio": "ind_36",
        "employerrep": "ind_77",
        "internationalstudentratio": "ind_14",
    }

    def cell_text(html, selector=".td-wrap-in"):
        # Parse one HTML snippet and return the stripped text of the
        # first element matching the CSS selector.
        return BeautifulSoup(html, "html.parser").select_one(selector).get_text(strip=True)

    def make_pretty(entry):
        record = {
            "name": cell_text(entry["uni"], ".uni-link"),
            "region": entry["region"],
            "location": entry["location"],
            "city": entry["city"],
            "rank": entry["overall_rank"],
        }
        for column, feed_key in score_fields.items():
            record[column] = cell_text(entry[feed_key])
        return record

    # A plain loop rather than the builtin map(): a later cell in this
    # notebook rebinds the global name ``map``.
    for entry in response.json()["data"]:
        yield make_pretty(entry)

#def main():
    
#    from itertools import islice

    #for entry in islice(get_entries(), 5):
        #print(entry)
    
#    return 0


#if __name__ == "__main__":
#    import sys
#    sys.exit(main())

In [None]:
# This runs the above scrape and puts the data into a pandas dataframe:
qs2022 = pd.DataFrame(get_entries())

In [None]:
# There are 1300 institutions in the 2022 rankings:
len(qs2022)

In [None]:
# The data (first and last few rows) looks like this - overall scores plus individual components measures included, as well as
# institute name, region (=continent), location (=country) and city
qs2022

In [None]:
# I want to clean the data a little bit
# (i) keep the name of institution only up to the first comma or bracket, to avoid issues later
# (ii) make a new column "place", which is "city, country". This is for the geographic data - for most institutions I can find
# geographical data, but maybe not for those written in Arabic etc; "city" alone is not enough, as e.g., both U Cambridge and MIT are in 
# "Cambridge", so we need the country to distinguish them

In [None]:
# (i) only keep names up to the first comma or bracket.
# The final strip() removes the space that is otherwise left behind in
# front of a dropped "(...)" suffix, and the .str accessors pass missing
# names through as NaN instead of raising.
qs2022['name'] = (
    qs2022['name']
    .str.split(',').str[0]
    .str.split('(').str[0]
    .str.strip()
)
# (ii) make new "place" column, formatted as "city, country"
qs2022['place'] = qs2022['city'] + ', ' + qs2022['location']

In [None]:
qs2022

In [None]:
# Now we add the location data in latitude, longitude form, using Nominatim.
# This first tries to find the location from the "name" of institution, but if this fails (rare, but happens), it 
# uses the "place"

In [None]:
# Geocoder client shared by the lookups below; user_agent and timeout are
# passed straight through to geopy's Nominatim class.
# NOTE(review): "StackOverFlow" looks like a placeholder carried over from
# an example - Nominatim's usage policy presumably expects a user agent
# that identifies this application; confirm and replace.
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="StackOverFlow", timeout=3)

In [None]:
# This takes a long time to look up all 1300 locations so don't run it every time!

#locations = []
#for i in range(len(qs2022)):
#    location_from_nominatim = geolocator.geocode(qs2022['name'][i], addressdetails=True, language='en')
#    if location_from_nominatim!=None:
#        locations.append(location_from_nominatim)
#    else:
#        locations.append(geolocator.geocode(qs2022['place'][i], addressdetails=True, language='en'))
        
# qs2022["location_info"] = locations 


In [None]:
# I think I had to fix a single entry manually...

# qs2022.at[294,'location_info'] = geolocator.geocode('xian jiaotong university', addressdetails=True, language='en')

In [None]:
# For ease later we'll make separate columns for latitude and longitude

# qs2022['latitude'] = qs2022.location_info.apply(lambda x: x.raw['lat'])
# qs2022['longitude'] = qs2022.location_info.apply(lambda x: x.raw['lon'])

In [None]:
# Because it takes a long time, here I just save the data to a csv file so I can read it in again

#qs2022.to_csv('qs2022_data.csv')

In [None]:
# Load the previously saved QS table from disk.
# NOTE(review): if the file was written with a plain to_csv() call, the
# old index will come back as an extra "Unnamed: 0" column - confirm, and
# consider index_col=0 if so.
qs2022data = pd.read_csv('qs2022_data.csv')

In [None]:
# So here is the QS data with latitude and longitude data for each institution

qs2022data

In [None]:
# now we have latitudes and longitudes for every university in the QS2022 rankings, along with all the QS ranking data

## Plotting the data on a map

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

In [None]:
# Basemap has a bunch of map projections and other basic data preloaded. 

In [None]:
# In case we need it I downloaded a list of countries with latitude and longitude data, but I don't think it's necessary
# I don't know what "importance" means either!
countries = "countries.csv"
country_data = pd.read_csv(countries)

In [None]:
country_data

In [None]:
# Create a 12 x 9 figure.
# NOTE(review): with the inline backend each cell presumably renders and
# closes its own figure, so the maps drawn in later cells may not use this
# figure or its size - confirm, and move this line into the plotting
# cells if the larger size is wanted there.
fig = plt.figure(figsize=(12,9))

In [None]:
# We can use Basemap to draw a basic map of the world, choosing a particular projection, e.g. choose one of

#m = Basemap(projection = 'mill', llcrnrlat = -90, urcrnrlat = 90, llcrnrlon = -180, urcrnrlon = 180, resolution = 'c')
#m = Basemap(projection = 'moll', lon_0 = 0, resolution = 'c')
#m = Basemap(projection='gall',llcrnrlat=-60,urcrnrlat=80,llcrnrlon=-180,urcrnrlon=180,resolution='c')
m = Basemap(projection='mill',llcrnrlat=-60,urcrnrlat=80,llcrnrlon=-180,urcrnrlon=180,resolution='c')

In [None]:
# Here is just coastlines and country/state borders

m.drawcoastlines()  # coastlines in the default colour
m.drawcountries(color='gray')  # country borders in gray
m.drawstates(color='gray')  # state borders in gray
plt.show()

In [None]:
# Overlay every institution in the QS table on the base map:

m.drawcoastlines()
m.drawcountries(color='gray')
m.drawstates(color='gray')

# coordinates as plain lists of floats, one entry per institution
lat = qs2022data['latitude'].astype(float).tolist()
lon = qs2022data['longitude'].astype(float).tolist()

# small red dots, since there are a lot of points
m.scatter(lon, lat, latlon=True, marker='o', c='red', s=10, alpha=1)

plt.show()

In [None]:
# Show only the 20 best-ranked institutions (rank 1 first):

m.drawcoastlines()
m.drawcountries(color='gray')
m.drawstates(color='gray')

# independent copy of the first 20 rows after sorting by rank
qsdata_selection = qs2022data.sort_values(by='rank').head(20).copy()

# coordinates of the selection as plain lists of floats
lat = qsdata_selection['latitude'].astype(float).tolist()
lon = qsdata_selection['longitude'].astype(float).tolist()

# larger red dots than on the full map, as there are only 20 points
m.scatter(lon, lat, latlon=True, marker='o', c='red', s=50, alpha=1)

plt.show()

In [None]:
# Or "top 20" by citations - but be careful: sort_values() sorts in
# *ascending* order by default. That is right for ranks (1 is best), but
# for a score column the first 20 rows are the lowest scores, so this is
# actually the *bottom* 20 by citations:

m.drawcoastlines()
m.drawcountries(color='gray')
m.drawstates(color='gray')

# independent copy of the first 20 rows after an ascending sort
qsdata_selection = qs2022data.sort_values(by='citations').head(20).copy()

# coordinates of the selection as plain lists of floats
lat = qsdata_selection['latitude'].astype(float).tolist()
lon = qsdata_selection['longitude'].astype(float).tolist()

# mark the selected institutions
m.scatter(lon, lat, latlon=True, marker='o', c='red', s=50, alpha=1)

plt.show()

In [None]:
# Remember to add ascending=False to reverse the order and get the real
# top 20 when sorting by citations (or by anything else that is a score
# rather than a rank):

m.drawcoastlines()
m.drawcountries(color='gray')
m.drawstates(color='gray')

# independent copy of the 20 highest citation scores
qsdata_selection = qs2022data.sort_values(by='citations', ascending=False).head(20).copy()

# coordinates of the selection as plain lists of floats
lat = qsdata_selection['latitude'].astype(float).tolist()
lon = qsdata_selection['longitude'].astype(float).tolist()

# mark the selected institutions
m.scatter(lon, lat, latlon=True, marker='o', c='red', s=50, alpha=1)

plt.show()

In [None]:
# Basemap can also draw much prettier maps, but takes much longer!:

In [None]:
# Lambert Conformal map, 12000 km wide and 9000 km high, centred on
# 50N 107W with standard parallels at 45N and 55N.
# resolution=None skips processing of the boundary datasets.
m = Basemap(
    projection='lcc',
    width=12000000,
    height=9000000,
    lat_0=50,
    lon_0=-107.,
    lat_1=45.,
    lat_2=55,
    resolution=None,
)
# shaded-relief image as the map background
m.shadedrelief()
plt.show()

## Plotting some local (EPS) data

In [None]:
# This is a file of 2022/23 Research Mobility Fund awards in EPS - awardees, School, (main) partner, title, amount

rmf = "EPS_ISF_details.csv"
rmfdata = pd.read_csv(rmf)

In [None]:
rmfdata

In [None]:
# As before we can add geographical location data:

from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="StackOverFlow", timeout=3)
# Throttle the lookups to one request per second, so that applying the
# geocoder to a whole column does not flood the public Nominatim server.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

rmfdata['location_info'] = rmfdata.Partner.apply(
    lambda partner: geocode(partner, addressdetails=True, language='en')
)

# A lookup that finds nothing yields None rather than a location. Report
# those partners and store NaN coordinates instead of failing on `.raw`.
unmatched = rmfdata.loc[rmfdata.location_info.isna(), 'Partner'].tolist()
if unmatched:
    print("No location found for:", unmatched)

rmfdata['latitude'] = rmfdata.location_info.apply(
    lambda loc: loc.raw['lat'] if loc is not None else float('nan')
)
rmfdata['longitude'] = rmfdata.location_info.apply(
    lambda loc: loc.raw['lon'] if loc is not None else float('nan')
)

In [None]:
# And as before, I've saved this data now so I don't have to look it up again
# rmfdata.to_csv('eps_rmf_data_locations.csv')
# Load the saved table (awards plus location columns) back from disk.
# NOTE(review): a plain to_csv() call also writes the index, so this
# probably comes back with an extra "Unnamed: 0" column - confirm.
rmfdata_locations = pd.read_csv('eps_rmf_data_locations.csv')

In [None]:
rmfdata_locations

In [None]:
# Trying to plot this with labels, but looks pretty horrible!

fig = plt.figure(figsize=(20,16))
m = Basemap(projection='mill',llcrnrlat=-60,urcrnrlat=80,llcrnrlon=-180,urcrnrlon=180,resolution='c')
m.drawcoastlines()
m.drawcountries(color='gray')
m.drawstates(color='gray')

# Plot from the table loaded from 'eps_rmf_data_locations.csv', so this
# cell works without re-running the slow geocoding lookup.
partners = rmfdata_locations['Partner'].tolist()
lat = rmfdata_locations['latitude'].astype(float).tolist()
lon = rmfdata_locations['longitude'].astype(float).tolist()

# plotting the partner locations
m.scatter(lon, lat, latlon = True, s = 30, c = 'red', marker = 'o', alpha = 1)

# Label each point with the partner name, 10 points below the marker.
# m(lon, lat) converts degrees into the map's projection coordinates.
for txt, partner_lon, partner_lat in zip(partners, lon, lat):
#    plt.annotate(txt, m(partner_lon, partner_lat),xycoords='data',xytext=(0, -10), textcoords='offset points', color='r', arrowprops=dict(arrowstyle="fancy", color='g'))
    plt.annotate(txt, m(partner_lon, partner_lat), xycoords='data', xytext=(0, -10), textcoords='offset points', color='b')

#m = Basemap(width=12000000,height=9000000,projection='lcc',resolution=None,lat_1=45.,lat_2=55,lat_0=50,lon_0=-107.)
#m.shadedrelief()

plt.show()

In [None]:
# Messing around with global plots and labels...

from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt


# Named `globe_map` rather than `map`: rebinding `map` would shadow the
# builtin for every cell run afterwards (get_entries() relies on it).
globe_map = Basemap(projection='ortho',
                    lat_0=0, lon_0=0)

globe_map.drawmapboundary(fill_color='aqua')
globe_map.fillcontinents(color='coral',lake_color='aqua')
globe_map.drawcoastlines()


# Projection coordinates of lon=2, lat=41 (the point being labelled)
x, y = globe_map(2, 41)
# First label: text placed at an offset of (-90, 10) points from the point
x2, y2 = (-90, 10)

plt.annotate('Barcelona', xy=(x, y),  xycoords='data',
                xytext=(x2, y2), textcoords='offset points',
                color='r',
                arrowprops=dict(arrowstyle="fancy", color='g')
                )

# Second label: text placed in data coordinates at lon=0, lat=0, with a
# plain arrow pointing at the same point
x2, y2 = globe_map(0, 0)
plt.annotate('Barcelona', xy=(x, y),  xycoords='data',
                xytext=(x2, y2), textcoords='data',
                arrowprops=dict(arrowstyle="->")
                )
plt.show()