In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
# Fetch the public IS-Academia report filter form; the following cells inspect
# its fields to find the parameter names and ids needed to query the reports.
# The timeout keeps the notebook from hanging forever if the server never answers,
# and raise_for_status() fails here instead of letting later cells parse an error page.
req = requests.get(
    'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247',
    timeout=30,
)
req.raise_for_status()

In [None]:
# Parse the downloaded form with Python's built-in HTML parser.
main = BeautifulSoup(markup=req.text, features='html.parser')

In [None]:
# Dump the indented markup of the form's <body> so the fields can be located by eye.
body_markup = main.body.prettify()
print(body_markup)

**We see next that the Informatique section is in an `<input>` with name `zz_x_UNITE_ACAD`.**

In [None]:
# Grab the first element whose `name` attribute is zz_x_UNITE_ACAD and show its markup.
unit = main.find(attrs=dict(name="zz_x_UNITE_ACAD"))
unit_markup = unit.prettify()
print(unit_markup)

**We see that the Informatique section has id `249847`.** TODO: look this id up dynamically instead of hard-coding it.

In [None]:
# Id of the "Informatique" section, copied by hand from the markup printed above.
# TODO: look it up dynamically instead of hard-coding it.
unit_IN_id = 249847

In [None]:
# Grab the first element whose `name` attribute is zz_x_PERIODE_ACAD and show its markup.
period = main.find(attrs=dict(name="zz_x_PERIODE_ACAD"))
period_markup = period.prettify()
print(period_markup)

In [None]:
# Every <option> nested inside the academic-period field.
period_options = period.find_all(name="option")

In [None]:
# Show the `value` attribute (the period id) of each option, one per line.
for period_option in period_options:
    print(period_option["value"])

**We can get rid of the first one which is null**

In [None]:
# Drop the leading placeholder option. Slicing a fresh find_all() result (instead of
# re-slicing period_options itself) keeps this cell idempotent: running it a second
# time no longer discards a real academic period.
period_options = period.find_all("option")[1:]

In [None]:
# Map each academic period label (the option text) to its id (the option value).
year_dict = dict(
    (period_option.text, period_option["value"]) for period_option in period_options
)
year_dict

In [None]:
# Grab the first element whose `name` attribute is zz_x_PERIODE_PEDAGO and show its markup.
semester = main.find(attrs=dict(name="zz_x_PERIODE_PEDAGO"))
semester_markup = semester.prettify()
print(semester_markup)

In [None]:
# Every <option> nested inside the pedagogical-period (semester) field.
options = semester.find_all(name="option")
options

In [None]:
# Keep only the semester options whose label mentions "Bachelor".
bachelor_options = list(filter(lambda opt: "Bachelor" in opt.text, options))
bachelor_options

In [None]:
# Map each Bachelor semester label (the option text) to its id (the option value).
bachelor_dict = dict(
    (semester_option.text, semester_option["value"]) for semester_option in bachelor_options
)
bachelor_dict

Now we have enough parameters to try to do a request to the request url. We found it using postman, it's a simple "get" request on `http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.bhtml`. We analysed the request and the field names are the same as the `input` names, but with leading `ww` instead of `zz`, for instance: `zz_x_UNITE_ACAD` becomes `ww_x_UNITE_ACAD`.

In [None]:
# Endpoint that the report GET requests below are sent to.
BASE_URL = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html"

In [None]:
# Short aliases for the query-string parameter names expected by the report endpoint.
fields = dict(
    unit="ww_x_UNITE_ACAD",
    year="ww_x_PERIODE_ACAD",
    semester="ww_x_PERIODE_PEDAGO",
)
fields

**Let's try to do a request of the Bachelor semester 1 for 2016-2017**

In [None]:
# Query parameters: Informatique section, academic year 2016-2017, Bachelor semester 1.
params = {
    fields["unit"]: unit_IN_id,
    fields["year"]: year_dict["2016-2017"],
    fields["semester"]: bachelor_dict["Bachelor semestre 1"],
}
params

In [None]:
# First attempt, with only the unit / year / semester parameters.
# The timeout prevents the cell from blocking forever on an unresponsive server.
# No raise_for_status() here: the status code itself is what we want to look at.
answer = requests.get(BASE_URL, params=params, timeout=30)
answer.status_code

In [None]:
# Show the raw body of the response to see what the server sent back.
answer.text

**We get a 404: we need more fields to have a correct query. We noticed on postman that we needed three other fields, that we're going to get from the form**

In [None]:
# Collect name -> value for the <input> elements of the form.
# Inputs without a `name` attribute cannot be sent as query parameters, so they are
# skipped; a missing `value` attribute is mapped to "" instead of raising a KeyError.
all_inputs = main.body.find_all("input")
all_inputs_values = {
    inp["name"]: inp.get("value", "")
    for inp in all_inputs
    if inp.has_attr("name")
}
print(all_inputs_values)

**We see on Postman that the additional fields we need are `ww_i_reportmodel` and `ww_i_reportModelXsl`.** Let's try again.

In [None]:
# Copy the two report-model fields from the scraped form inputs into the query parameters.
for field_name in ("ww_i_reportmodel", "ww_i_reportModelXsl"):
    params[field_name] = all_inputs_values[field_name]

In [None]:
# Display the query parameters gathered so far.
params

In [None]:
# Second attempt, now including the report-model fields.
# The timeout prevents the cell from blocking forever on an unresponsive server.
answer = requests.get(BASE_URL, params=params, timeout=30)
answer.status_code

In [None]:
# Parse the response and print its indented markup for inspection.
answersoup = BeautifulSoup(markup=answer.text, features="html.parser")
answer_markup = answersoup.prettify()
print(answer_markup)

**Even though we get a 200, this is still not the output we need. One last parameter is missing, but it is not in the form as an input. By inspecting our postman requests, we noticed that `ww_x_GPS` is included.**

In [None]:
# Add the ww_x_GPS parameter; -1 is used as a provisional value for now.
params.update(ww_x_GPS=-1)

In [None]:
# Third attempt, with ww_x_GPS added to the parameters.
# The timeout prevents the cell from blocking forever on an unresponsive server.
answer = requests.get(BASE_URL, params=params, timeout=30)
answer.status_code

In [None]:
# Parse the response and print its indented markup for inspection.
answersoup = BeautifulSoup(markup=answer.text, features="html.parser")
answer_markup = answersoup.prettify()
print(answer_markup)

**It works!**

In [None]:
import pandas as pd

In [None]:
# All <tr> rows of the first table in the response; preview the first five,
# separated by a visible rule.
lines = answersoup.body.table.find_all("tr")
row_separator = "\n=============\n"
print(row_separator.join(map(str, lines[:5])))

In [None]:
# Look at the last row of the table as well.
lines[-1]

**We can see that all lines except the first three correspond to students. Let's keep the third one as the header and discard the first two.**

In [None]:
# Row 2 is kept as the header, rows 0-1 are discarded and the remaining rows are
# the student rows. The rows are re-read from the parsed table instead of slicing
# `lines` in place, so re-running this cell does not drop three more rows each time.
table_rows = answersoup.body.table.find_all('tr')
header = table_rows[2]
lines = table_rows[3:]

In [None]:
# Display the header row to read off the column positions.
header

**We are interested in "Civilité" (gender) and "No Sciper". They are at column indices `0` and `10`, respectively.**

In [None]:
# Positions, within each student row, of the two cells we keep
# ("Civilité" and "No Sciper" in the header row).
GENDER_COLUMN = 0
SCIPER_COLUMN = 10

# Extract [gender, sciper] from every student row.
items = []
for line in lines:
    td_list = line.find_all("td")
    items.append([td_list[GENDER_COLUMN].text, td_list[SCIPER_COLUMN].text])

# Build the frame in one step, indexed by Sciper number. DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0, so appending to an empty
# frame no longer runs on current pandas.
df = pd.DataFrame(
    {"Gender": [gender for gender, _ in items]},
    index=[sciper for _, sciper in items],
)

In [None]:
# Peek at up to 10 random students. The sample size is capped so the cell also works
# when fewer than 10 rows were scraped, and the fixed seed makes the preview
# reproducible across runs.
df.sample(n=min(10, len(df)), random_state=0)

In [None]:
# True when no value appears more than once in the index.
df.index.is_unique

**Sciper is unique, as it should be.**

In [None]:
# Number of students per gender.
gender_groups = df.groupby("Gender")
gender_groups["Gender"].count()