# 02 - Data from the Web

## Part 0 - Helpers and constants

#### Import modules

In [56]:
# Import necessary modules
import requests
from bs4 import BeautifulSoup
import re

#### Constants

In [57]:
# Base URL for isa.epfl.ch public report searches
isa_base_url = 'http://isa.epfl.ch/imoniteur_ISAP/'

#### Helpers

In [58]:
# Helper for creating search path
def isa_url(path):
    """Return the full ISA URL obtained by appending `path` to `isa_base_url`."""
    return ''.join((isa_base_url, path))

In [59]:
# Allow accessing dictionary items as object attributes
# https://goodcode.io/articles/python-dict-object/
class objdict(dict):
    """Dictionary whose items can also be read, written and deleted as attributes."""

    def __getattr__(self, name):
        # Invoked only when regular attribute lookup fails: fall back to the items
        if name not in self:
            raise AttributeError("No such attribute: " + name)
        return self[name]

    def __setattr__(self, name, value):
        # Every attribute assignment is stored as a dictionary item
        self[name] = value

    def __delattr__(self, name):
        if name not in self:
            raise AttributeError("No such attribute: " + name)
        del self[name]

In [60]:
def get_page(url, params=None):
    """Fetch `url` with a GET request and parse the response body as HTML.

    Parameters:
        url: address to request.
        params: optional query parameters forwarded to `requests.get`.

    Returns:
        A `BeautifulSoup` object built from the response text with the
        'html.parser' parser, or None (after printing the status code)
        when the response status is not `requests.codes.ok`.
    """
    r = requests.get(url, params=params)

    # Check whether the request succeeded or not.
    # Status codes are compared by value: `is not` tests object identity,
    # which is not a reliable way to compare integers.
    if r.status_code != requests.codes.ok:
        print("Something went wrong. Got status code = " + str(r.status_code))
        return None

    # TODO: Check Content-Length

    # Parse the response body with BeautifulSoup
    return BeautifulSoup(r.text, 'html.parser')

## Part 1 - Scraper Module

The first step is to fetch all the data from ISA.

To do this, we will have to scrape the respective frames that ISA serves for the form and then the search results. Steps are detailed below.

In [61]:
# URL of the public ISA home page.
isa_home_url = isa_url('%21gedpublicreports.htm?ww_i_reportmodel=133685247')

#### Get the form URL from the home page

In [6]:
home_soup = get_page(isa_home_url)

In [62]:
# Find the form frame
toc_frame = home_soup.find('frame', attrs={'name': 'toc'})

In [63]:
# Extract the form's URL
toc_frame_url = toc_frame.attrs['src']
toc_frame_url # Sanity check

'!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247'

In [64]:
# Repeat previous process for the form's frame
toc_url = isa_url(toc_frame_url)

#### Get the form

In [10]:
toc_soup = get_page(toc_url)

In [65]:
# Find the form itself within the page
# The form's name is a heuristic, read from the page's source
form_elem = toc_soup.find('form', attrs={'name': 'f'})

In [66]:
# Find the form's action on validation
form_action = form_elem['action']
form_action_url = isa_url(form_action)
form_action_url # Debug print

'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter'

In [67]:
# Define a payload dictionary with all the parameters expected by ISA's API endpoint

# /!\ By default we will request all possibilities from the endpoint, and then filter. /!\ #

# Example requests for specific academic or pedagogic periods are left FYI.

payload = {
    'ww_b_list': '1',
    'ww_i_reportmodel': '133685247',
    'ww_c_langue': '',
    
    # Format = html
    'ww_i_reportModelXsl': '133685270', 
    
    # Faculty = { Informatique : 249847 }
    'zz_x_UNITE_ACAD': 'Informatique',
    'ww_x_UNITE_ACAD': '249847',
    
    # Academic Period = { '' : null } if we want all possible periods
    #                 = { '2016-2017' : 355925344 } for a specific year
    'zz_x_PERIODE_ACAD': '',
    'ww_x_PERIODE_ACAD': 'null',

    
    # Pedagogic Period = { '' : null } if we want all possible periods
    #                  = { 'Bachelor semestre 1' : 249108 } for a specific period
    'zz_x_PERIODE_PEDAGO': '',
    'ww_x_PERIODE_PEDAGO': 'null',
    
    # Winter or summer semesters
    'zz_x_HIVERETE': '',
    'ww_x_HIVERETE': 'null',
    
    'dummy': 'ok'
}

#### Fill in the form and parse the search results

In [68]:
# Parse the result
form_result_soup = get_page(form_action_url, params=payload)
# form_result_soup.prettify() # Debug print

In [69]:
# Extract the links from the list of results
all_links = form_result_soup('a', class_='ww_x_GPS')
# all_links # Debug print

In [70]:
# Store the links in a dictionary. Some cleaning is applied, such as whitespace stripping
# Note : removing 'Tous' allows us to iterate and filter independently each result
#        instead of an aggregate HTML table which would be much more painful to parse
links = [
    objdict({
        'text': link.text.strip(),
        'url': link.attrs['onclick']
    }) for link in all_links if link.text.strip() != 'Tous'
]

For each link, the page loads the corresponding results with the following on-click action:

```
loadReport('ww_x_GPS=2021043255');return false;
```

We will use the following function ```parse_url()``` to extract the ```ww_x_GPS``` parameter expected by the API to return the desired information about the students.

In [71]:
parse_url_pattern = r"""'(\w+)=(\d+)'"""
parse_url_re = re.compile(parse_url_pattern)

def parse_url(url):
    """Extract the single-quoted `key=value` pair from an on-click string.

    The key is a run of word characters and the value a run of digits,
    e.g. "loadReport('ww_x_GPS=2021043255');return false;".

    Returns:
        An objdict with the string fields 'key' and 'value', or None
        (after printing a message) when no such pair is found in `url`.
    """
    # Use the pre-compiled pattern instead of re-parsing the raw pattern on each call
    match = parse_url_re.search(url)

    if match is None:
        print('parse_url: parsing failure: %s' % url)
        return None

    return objdict({
        'key': match.group(1),
        'value': match.group(2)
    })

Each entry of the list must itself be parsed to extract the section, the academic year and the pedagogic period. Its format is the following:

```
Informatique, 2016-2017, Bachelor semestre 1
```

We call an element of the list an item, and it will be parsed by the following ```parse_item()``` function.

In [72]:
parse_item_pattern = r"""Informatique,\s*(?P<start_year>\d+)-(?P<end_year>\d+),\s*(?P<degree>[\w]+) semestre (?P<semester>\d)"""
parse_item_re = re.compile(parse_item_pattern)

def parse_item(item, url, payload):
    """Parse one result entry into a structured record.

    Parameters:
        item: entry text, e.g. "Informatique, 2016-2017, Bachelor semestre 1".
        url: on-click string of the entry, handed to `parse_url`.
        payload: base request parameters; it is copied, never modified.

    Returns:
        An objdict with 'start_year', 'end_year' and 'semester' as ints,
        'degree' as a string and 'params' (a copy of `payload` extended with
        the key/value pair parsed from `url`), or None when `item` does not
        match the expected format or `url` cannot be parsed.
    """
    match = parse_item_re.match(item)

    # The above regex will match only the kind of records we are interested in, so we can
    # safely drop all the non-matching ones
    if match is None:
        return None

    parsed_url = parse_url(url)

    if parsed_url is None:
        return None

    # Work on a copy so that the shared base payload stays untouched
    params = payload.copy()
    params[parsed_url.key] = parsed_url.value

    return objdict({
        'start_year': int(match.group('start_year')),
        'end_year': int(match.group('end_year')),
        'degree': match.group('degree'),
        'semester': int(match.group('semester')),
        'params': params
    })

def filter_none(a_list):
    """Return a new list with every element of `a_list` that is not None.

    Elements are tested by identity (`is not None`), so objects that define
    their own comparison operators are never dropped by mistake.
    """
    return [item for item in a_list if item is not None]

In [73]:
all_items = [parse_item(link.text, link.url, payload) for link in links]

In [74]:
def is_valid_item(item, min_start_year=2007, max_end_year=2016):
    """Tell whether `item` is a parsed record inside the requested year range.

    Parameters:
        item: record exposing `start_year` and `end_year`, or None.
        min_start_year: smallest accepted `start_year` (inclusive).
        max_end_year: largest accepted `end_year` (inclusive).

    Returns:
        True when `item` is not None and both bounds hold, False otherwise.
        The default bounds select the 2007 to 2016 period.
    """
    return (
        item is not None
        and item.start_year >= min_start_year
        and item.end_year <= max_end_year
    )

def filter_items(items):
    """Return the elements of `items` accepted by `is_valid_item`, in their original order."""
    kept = []
    for candidate in items:
        if is_valid_item(candidate):
            kept.append(candidate)
    return kept

In [76]:
items = filter_items(all_items)
items

[{'degree': 'Bachelor',
  'end_year': 2008,
  'params': {'dummy': 'ok',
   'ww_b_list': '1',
   'ww_c_langue': '',
   'ww_i_reportModelXsl': '133685270',
   'ww_i_reportmodel': '133685247',
   'ww_x_GPS': '71297531',
   'ww_x_HIVERETE': 'null',
   'ww_x_PERIODE_ACAD': 'null',
   'ww_x_PERIODE_PEDAGO': 'null',
   'ww_x_UNITE_ACAD': '249847',
   'zz_x_HIVERETE': '',
   'zz_x_PERIODE_ACAD': '',
   'zz_x_PERIODE_PEDAGO': '',
   'zz_x_UNITE_ACAD': 'Informatique'},
  'semester': 1,
  'start_year': 2007},
 {'degree': 'Bachelor',
  'end_year': 2008,
  'params': {'dummy': 'ok',
   'ww_b_list': '1',
   'ww_c_langue': '',
   'ww_i_reportModelXsl': '133685270',
   'ww_i_reportmodel': '133685247',
   'ww_x_GPS': '71297550',
   'ww_x_HIVERETE': 'null',
   'ww_x_PERIODE_ACAD': 'null',
   'ww_x_PERIODE_PEDAGO': 'null',
   'ww_x_UNITE_ACAD': '249847',
   'zz_x_HIVERETE': '',
   'zz_x_PERIODE_ACAD': '',
   'zz_x_PERIODE_PEDAGO': '',
   'zz_x_UNITE_ACAD': 'Informatique'},
  'semester': 2,
  'start_year':

#### Going through the years to extract the list of enrolled students

In [96]:
list_page_url = isa_url('!GEDPUBLICREPORTS.html')

def parse_student_row(tr, item, headers):
    """Turn one table row into a student record.

    Parameters:
        tr: row element; the text of its 'td' cells provides the values.
        item: record providing 'degree', 'semester', 'start_year' and 'end_year'.
        headers: column names; the n-th header labels the n-th cell.

    Returns:
        An objdict mapping each header to its cell text, completed with the
        four fields copied from `item`.
    """
    cell_texts = [cell.text for cell in tr.find_all('td')]

    # The position of a header selects the cell holding its value
    row = {header: cell_texts[pos] for pos, header in enumerate(headers)}

    # Tag the row with the period it was listed under
    for field in ('degree', 'semester', 'start_year', 'end_year'):
        row[field] = item[field]

    return objdict(row)

def parse_students_list(page_soup, item):
    """Parse every student row of the first table found in `page_soup`.

    Parameters:
        page_soup: parsed page, or None when the page could not be fetched.
        item: record describing the listing; forwarded to `parse_student_row`.

    Returns:
        A list with one record per student row. The list is empty when
        `page_soup` is None, when the page contains no table, or when the
        table has no header row.
    """
    # get_page() returns None for a failed request: nothing to parse then
    if page_soup is None:
        return []

    table = page_soup.find('table')
    if table is None:
        return []

    trs = table.find_all('tr')

    # The first row is skipped, the second one holds the column headers
    # and every following row describes a student
    if len(trs) < 2:
        return []
    header_row, students_rows = trs[1], trs[2:]

    headers = [th.text for th in header_row.find_all('th')]

    return [
        parse_student_row(student_row, item, headers) for student_row in students_rows
    ]

def load_student_page(item):
    """Fetch the page at `list_page_url` using `item.params` as request parameters.

    Returns an objdict with the fetched page under 'soup' (whatever
    `get_page` returned) and the originating `item` under 'item'.
    """
    soup = get_page(list_page_url, item.params)
    return objdict({'soup': soup, 'item': item})

# Fetch the students page of every selected item (one request per item)
students_pages = [load_student_page(item) for item in items]

# Parse each fetched page into a list of student records (one list per page)
students_lists = [
    parse_students_list(page.soup, page.item) for page in students_pages
]

In [97]:
def flatten_list(a_list):
    """Concatenate the sub-lists of `a_list` into one flat list, preserving order."""
    flat = []
    for sublist in a_list:
        flat.extend(sublist)
    return flat

student_list = flatten_list(students_lists)

In [98]:
import pandas as pd

In [423]:
# Build the students table, keep the columns of interest and give the
# scraped French column names short English ones
students_data = (
    pd.DataFrame(student_list)
    [['No Sciper', 'Civilité', 'Nom Prénom', 'degree', 'semester', 'start_year', 'end_year']]
    .rename(columns={'No Sciper': 'sciper', 'Civilité': 'title', 'Nom Prénom': 'name'})
)
students_data.sample(10)

Unnamed: 0,sciper,title,name,degree,semester,start_year,end_year
3941,205434,Monsieur,Briant Florian Marcel,Bachelor,6,2012,2013
7135,202293,Monsieur,Aiulfi Loris Sandro,Bachelor,5,2015,2016
2059,186942,Monsieur,Amrani Ismaïl,Bachelor,4,2010,2011
5864,224552,Monsieur,Monnin Lucas,Bachelor,3,2014,2015
1625,161220,Monsieur,Tourino Pablo,Master,1,2009,2010
3618,228496,Monsieur,Mizraji Thomas,Bachelor,2,2012,2013
5701,250134,Monsieur,Favre-Bulle Cyril Josué,Bachelor,2,2014,2015
1371,194971,Monsieur,Poiffaut Romain,Bachelor,3,2009,2010
3782,217439,Monsieur,Emery Timothée,Bachelor,4,2012,2013
5773,247277,Monsieur,Polit Chavez John Teddy,Bachelor,2,2014,2015


In [501]:
def filter_bsc_students(df):
    """Select the rows of `df` whose 'degree' is 'Bachelor', indexed by 'sciper'.

    NOTE(review): the final `drop` step looks incorrect and its intended effect
    is unclear (see the note on the return statement) - confirm what should be
    filtered out before relying on this function.
    """
    bsc_students = df[df['degree'] == 'Bachelor']

    # Boolean mask selecting the Bachelor rows whose semester is neither 1 nor 6
    to_remove_idx = ~bsc_students['semester'].isin([1, 6])
    to_remove = bsc_students[to_remove_idx].set_index('sciper')

    # NOTE(review): `to_remove` is a DataFrame, but `drop` expects index labels;
    # presumably `to_remove.index` (or a plain row filter on the mask) was meant.
    # The output recorded in the notebook still contains semesters 2 to 5, which
    # suggests that nothing was actually dropped when it was last run - TODO confirm.
    return bsc_students.set_index('sciper').drop(to_remove)

# Bachelor rows, indexed and sorted by sciper
bsc = (
    filter_bsc_students(students_data)
    .reset_index()
    .set_index(['sciper'])
    .sort_index()
)
bsc

# Work on a copy so that `bsc` itself is left untouched
d = bsc.copy()

# Number of non-null 'semester' entries per sciper
semester_count = (
    d.reset_index()[['sciper', 'semester']]
    .groupby('sciper')
    .count()
    .rename(columns={'semester': 'semester_count'})
)

# Attach the per-student count to each of the student's rows (join on the sciper index)
with_count = d.merge(semester_count, left_index=True, right_index=True)
with_count.head(10)

Unnamed: 0_level_0,title,name,degree,semester,start_year,end_year,semester_count
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
147008,Monsieur,Good Xavier,Bachelor,3,2009,2010,6
147008,Monsieur,Good Xavier,Bachelor,5,2010,2011,6
147008,Monsieur,Good Xavier,Bachelor,6,2010,2011,6
147008,Monsieur,Good Xavier,Bachelor,4,2009,2010,6
147008,Monsieur,Good Xavier,Bachelor,1,2008,2009,6
147008,Monsieur,Good Xavier,Bachelor,2,2008,2009,6
153762,Monsieur,Conus Johann,Bachelor,6,2007,2008,2
153762,Monsieur,Conus Johann,Bachelor,5,2007,2008,2
154157,Madame,Andriambololona Riana Miarantsoa,Bachelor,5,2007,2008,1
159516,Monsieur,Raja Yanick,Bachelor,5,2007,2008,2


In [510]:
# Collapse to one row per sciper, keeping the 'first' aggregate of
# title, name and semester_count within each group
abc = with_count[['title', 'name', 'semester_count']].reset_index().groupby('sciper').agg('first')
abc.sample(10)

Unnamed: 0_level_0,title,name,semester_count
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
167845,Monsieur,Del Colle Grégory,6
223670,Monsieur,Lièvre Loïc Mathieu Pascal,2
211826,Monsieur,Scheidegger Fabrice Didier,1
234698,Monsieur,Allemand Adrien,1
205434,Monsieur,Briant Florian Marcel,8
257569,Madame,Dakir Kenza,2
185991,Monsieur,Benhamadi Yassine,8
218656,Monsieur,Videla André,7
195994,Madame,Khoshi Maryam,3
217059,Monsieur,Tanoh Kevin Matteo Afing,8


In [521]:
# Average semester_count of the students sharing the same title
by_title = abc.groupby('title')
by_title['semester_count'].mean()

title
Madame      3.486726
Monsieur    4.168074
Name: semester_count, dtype: float64