In [4]:
from bs4 import BeautifulSoup

import requests
import pandas as pd
import re

"""
Example usage:
>>> from data_mining import *
>>> mine_data('2009', 'B1', 'Spring')
...


Will output a dict containing a global data frame
The data frame is indexed by the the year, semester, season and sciper
"""

years = {
	'2007': '978181',
	'2008': '978187',
	'2009': '978195',
	'2010': '39486325',
	'2011': '123455150',
	'2012': '123456101',
	'2013': '213637754',
	'2014': '213637922',
	'2015': '213638028',
	'2016': '355925344'
}

semesters = {
	'B1': '249108',
	'B6': '942175',
	'B6b': '2226785',
	'M1': '2230106',
	'M3': '2230128',
	'M4': '2230140',
	'PMAut': '249127',
	'PMSpr': '3781783'
}

seasons = {
    'Autumn': '2936286',
    'Spring': '2936295'
    }

indexes =  ["year", "semester", "season"]

def get_url(year, semester, season):
    """Get the url corresponding to a given year and semester"""
    y, sem, sea = years[year], semesters[semester], seasons[season]
    return 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD='+ y +'&ww_x_PERIODE_PEDAGO=' + sem + '&ww_x_HIVERETE=' + sea

def request(year, semester, season):
    """return the http request corresponding to a given year and semester"""
    url = get_url(year, semester, season)
    return requests.get(url, verify=False)

def get_soup(year, semester, season):
    """return the data soup (BeautifulSoup) corresponding to a given year and semester"""
    r = request(year, semester, season)
    data = r.text
    return BeautifulSoup(data)


def get_table(soup, year, semester, season):
    """Transform the data soup into a dict containing the data frame
    Each dict contains the section, the year, and the dataframe containing all the corresponding data
    The dataframe contains all the columns returned by the html page + the index corresponding to "indexes"
    """
    global indexes
    table = soup.html.body.table ##get to the table
    list_df  = [] ##init list of data frame
    tc = table.children ##every rows of the table
    first_row = next(tc, None)
    attrs = first_row.text.split(',') ##extract the attributes from the header row
    section = attrs[0]
    year = attrs[1]
    nb_student = int(attrs[2].split("(")[1].split(" ")[0])
    if nb_student != 0: ##next row should contain columns info (except if there is no student)
        next_row = next(tc, None) ##directly iterate our iterator the next row
        columns = indexes + list(map(lambda l: l.text, next_row.children)) ##transform the children into a list of the inner text of each children
        df = pd.DataFrame(columns=columns) ##create the data frame with the columns from this list

    for c in tc:
        t = [year, semester, season] + list(map(lambda l: l.text, c.children))[:-1] ##transform the children into a list of the inner text of each children (corresponding here to each column)
        df.loc[df.shape[0]] = t ##append the data to the last dataframe created

    sciper_c = columns[-1]
    df = df.set_index(indexes + [sciper_c])

    return df

def mine_data(year, semester, season):
    soup = get_soup(year, semester, season) ##get the soup
    return get_table(soup, year, semester, season) ##process the soup

In [5]:
for year in range(2007,2017):
    mine_data(str(year), 'B1', 'Autumn')
    mine_data(str(year), 'B6', 'Spring')



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Civilité,Nom Prénom,Orientation Bachelor,Orientation Master,Spécialisation,Filière opt.,Mineur,Statut,Type Echange,Ecole Echange
year,semester,season,No Sciper,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2014-2015,B1,Autumn,246671,Monsieur,Abate Bryan Jeremy,,,,,,Présent,,
2014-2015,B1,Autumn,249344,Madame,Abboud Magaly,,,,,,Présent,,
2014-2015,B1,Autumn,238673,Monsieur,Adler Yves-Fredricq Samuel,,,,,,Présent,,
2014-2015,B1,Autumn,246443,Monsieur,Ahmed Fares,,,,,,Présent,,
2014-2015,B1,Autumn,251759,Monsieur,Alami-Idrissi Ali,,,,,,Présent,,
2014-2015,B1,Autumn,248575,Monsieur,Albergoni Tobia,,,,,,Présent,,
2014-2015,B1,Autumn,243252,Monsieur,Alemán Ignacio Sukarno,,,,,,Présent,,
2014-2015,B1,Autumn,242096,Monsieur,Allemann Michael Kevin,,,,,,Présent,,
2014-2015,B1,Autumn,249707,Monsieur,Alvarez Seoane Esteban,,,,,,Présent,,
2014-2015,B1,Autumn,235644,Monsieur,Antelo Blanco Lucas,,,,,,Présent,,
