# 02 - Data from the Web

## Part 0 - Helpers and constants

#### Import modules

In [83]:
# Import necessary modules
import requests
from bs4 import BeautifulSoup
import re

#### Constants

In [51]:
# Base URL for isa.epfl.ch public report searches.
# The trailing slash matters: isa_url() appends paths by plain string concatenation.
isa_base_url = 'http://isa.epfl.ch/imoniteur_ISAP/'

#### Helpers

In [52]:
# Helper for creating search path
def isa_url(path):
    """Return the absolute ISA URL obtained by appending `path` to `isa_base_url`.

    The two strings are joined as-is: no separator is inserted and nothing is
    escaped, so `path` must already be URL-encoded where needed.
    """
    return ''.join((isa_base_url, path))

In [71]:
def get_page(url, params=None):
    """GET `url` and return the response body parsed as a BeautifulSoup document.

    Args:
        url: Address to request.
        params: Optional query-string parameters, forwarded to requests.get().

    Returns:
        A BeautifulSoup object built with the 'html.parser' backend, or None
        (after printing the status code) when the response status is not 200.
    """
    r = requests.get(url, params=params)
    # Compare by value: `is not 200` tests object identity, which only happens to
    # work through CPython's small-int cache and raises a SyntaxWarning on 3.8+.
    if r.status_code != 200: # Check the request completed
        print("Something went wrong. Res.status_code = " + str(r.status_code))
        return None
    # Hand the decoded HTML text to BeautifulSoup for parsing
    return BeautifulSoup(r.text, 'html.parser')

## Part 1 - Scraper Module

The first step is to fetch all the data from ISA.

To do this, we will have to scrape the respective frames that ISA serves for the form and then the search results. Steps are detailed below.

In [72]:
# Full constructed url for the ISA search home page ('%21' is the percent-encoded '!')
isa_home_url = isa_url('%21gedpublicreports.htm?ww_i_reportmodel=133685247')

#### Get the form URL from the home page

In [73]:
# Download and parse the search home page.
# get_page() returns None when the response status is not 200, in which case the next cells fail.
home_soup = get_page(isa_home_url)
# home_soup # debug print

In [74]:
# Locate the frame named 'toc', which hosts the search form
toc_frame = home_soup.find('frame', {'name': 'toc'})

In [75]:
# The frame's `src` attribute holds the form page's URL, relative to the ISA base URL
toc_frame_url = toc_frame['src']
toc_frame_url # Sanity check

'!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247'

In [76]:
# Repeat previous process for the form's frame:
# turn its relative `src` into an absolute URL so it can be fetched like the home page
toc_url = isa_url(toc_frame_url)

#### Get the form

In [77]:
# Download and parse the page served inside the 'toc' frame; it contains the search form
toc_soup = get_page(toc_url)
# toc_soup # debug print

In [78]:
# Find the form itself within the page.
# The form's name ('f') is a heuristic, read from the page's source.
form_elem = toc_soup.find('form', {'name': 'f'})
# print(form_elem.prettify()) # Debug print

In [79]:
# The form's `action` attribute is the target it submits to on validation;
# resolve it against the ISA base URL to get the endpoint to query
form_action = form_elem.attrs['action']
form_action_url = isa_url(form_action)
form_action_url # Debug print

'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter'

In [91]:
# Define a payload dictionary with all the parameters expected by ISA's API endpoint

# /!\ By default we will request all possibilities from the endpoint, and then filter. /!\ #

# Each filter is sent as a pair: 'zz_x_*' holds the displayed label and 'ww_x_*' the matching
# identifier. An empty label together with the string 'null' leaves that filter unset.
# Example requests for specific academic or pedagogic periods are left FYI.

payload = {
    'ww_b_list': '1',
    'ww_i_reportmodel': '133685247',
    'ww_c_langue': '',
    
    # Format = html
    'ww_i_reportModelXsl': '133685270', 
    
    # Faculty = { Informatique : 249847 }
    'zz_x_UNITE_ACAD': 'Informatique',
    'ww_x_UNITE_ACAD': '249847',
    
    # Academic Period = { '' : null } if we want all possible periods
    #                 = { '2016-2017' : 355925344 } for a specific year
    'zz_x_PERIODE_ACAD': '',
    'ww_x_PERIODE_ACAD': 'null',

    
    # Pedagogic Period = { '' : null } if we want all possible periods
    #                  = { 'Bachelor semestre 1' : 249108 } for a specific period
    'zz_x_PERIODE_PEDAGO': '',
    'ww_x_PERIODE_PEDAGO': 'null',
    
    # Winter or summer semesters ({ '' : null } = both)
    'zz_x_HIVERETE': '',
    'ww_x_HIVERETE': 'null',
    
    'dummy': 'ok'
}

#### Fill in the form and parse the search results

In [100]:
# Submit the search: GET the form's action URL with the payload as query parameters,
# then parse the returned result list
form_result_soup = get_page(form_action_url, params=payload)
# form_result_soup.prettify() # Debug print

In [101]:
# Collect every result link, i.e. each <a> element carrying the CSS class 'ww_x_GPS'
all_links = form_result_soup.find_all('a', class_='ww_x_GPS')
# all_links # Debug print

In [103]:
# Store the links as a list of {'text', 'url'} dictionaries, with the label stripped of whitespace.
# Note : the 'Tous' entry is left out so that each result can be fetched and filtered independently
#        instead of parsing one aggregate HTML table, which would be much more painful.
links = [
    {'text': label, 'url': anchor.attrs['onclick']}
    for anchor, label in ((a, a.text.strip()) for a in all_links)
    if label != 'Tous'
]

For each link, the page loads the respective results with the following on-click action:

```
loadReport('ww_x_GPS=2021043255');return false;
```

We will use the following function ```parse_url()``` to extract the ```ww_x_GPS``` parameter expected by the API to return the desired information about the students.

In [106]:
parse_url_pattern = r"""'(\w+)=(\d+)'"""
parse_url_re = re.compile(parse_url_pattern)

def parse_url(url):
    """Extract the single-quoted `key=value` pair from a link's on-click action.

    Args:
        url: On-click text, e.g. "loadReport('ww_x_GPS=2021043255');return false;".

    Returns:
        {'key': <word characters>, 'value': <digits>} for the first quoted pair
        found, or None (after printing a diagnostic) when there is no such pair.
    """
    # Search with the precompiled regex instead of the raw pattern string
    match = parse_url_re.search(url)

    if match is None:
        print('parse_url: parsing failure: ' + url)
        return None

    key, value = match.groups()
    return {
        'key': key,
        'value': value
    }

The list itself must be parsed to extract information regarding section, academic year, and pedagogic period. The format is the following:

```
Informatique, 2016-2017, Bachelor semestre 1
```

We call an element of the list an item, and it will be parsed by the following ```parse_item()``` function.

In [107]:
parse_item_pattern = r"""([^,]+),\s*(\d+)-(\d+),\s*([\w\s]+)"""
parse_item_re = re.compile(parse_item_pattern)

def parse_item(item, url, payload):
    """Parse one result entry into section, year range, semester and request parameters.

    Args:
        item: Link label, e.g. 'Informatique, 2016-2017, Bachelor semestre 1'.
        url: On-click text of the same link; handed to parse_url().
        payload: Base query parameters. It is copied, never modified.

    Returns:
        A dict with 'section', 'semester', 'year' ({'start', 'end'}, kept as
        strings) and 'params' (a copy of `payload` plus the pair extracted from
        `url`), or None when the label or the on-click text cannot be parsed.
    """
    match = parse_item_re.match(item)

    if match is None:
        print('parse_item: parsing failure: ' + item)
        return None

    parsed_url = parse_url(url)

    if parsed_url is None:
        return None

    # Copy so that every item carries its own parameter set
    params = payload.copy()
    params[parsed_url['key']] = parsed_url['value']

    section, year_start, year_end, semester = match.groups()
    return {
        'section': section,
        'semester': semester,
        'year': {
            'start': year_start,
            'end': year_end
        },
        'params': params
    }


In [110]:
# We can now parse the list of results and extract the final URLs to the student enrollment information.
# parse_item() returns None for an entry it cannot parse (after printing a diagnostic); such entries
# are dropped here so that later cells can index into every item safely.
items = [
    item
    for item in (parse_item(link['text'], link['url'], payload) for link in links)
    if item is not None
]
# items # Debug print

#### Going through the years to extract the list of enrolled students

In [114]:
# Endpoint that serves the report page for one result entry
list_page_url = isa_url('!GEDPUBLICREPORTS.html')

# Trial run: request the page for the first parsed item, using its own query parameters
res = get_page(list_page_url, params=items[0]['params'])
# res # Debug print