# 02 - Data from the Web


## You will never guess how long it takes EPFL students to complete their degree!

## Question 2 will shock you!

---

## Part 0 - Helpers and constants

#### Import modules

In [1]:
import requests

from bs4 import BeautifulSoup

import re

import pickle

import pandas as pd

#### Constants

In [2]:
# Base URL for isa.epfl.ch public report searches
isa_base_url = 'http://isa.epfl.ch/imoniteur_ISAP/'

#### Helpers

In [3]:
# Helper for creating search path
def isa_url(path):
    return isa_base_url + path

In [4]:
# Allow accessing dictionary items as object attributes
# https://goodcode.io/articles/python-dict-object/
class objdict(dict):
    def __getattr__(self, name):
        if name in self:
            return self[name]
        else:
            raise AttributeError("No such attribute: " + name)

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        if name in self:
            del self[name]
        else:
            raise AttributeError("No such attribute: " + name)

In [5]:
# Fetch the given url using a GET requests with the supplied parameters
# and return a BeautifulSoup object
def get_page(url, params=None):
    r = requests.get(url, params)
    
     # Check whether the request succeeded or not
    if r.status_code is not requests.codes.ok:
        print("Something went wrong. Got status code = %d" % r.status_code)
        return None
    
    # Make sure we actually got some content back, as ISA will return a 200 status code but no content
    # when the supplied parameters are wrong.
    if int(r.headers['Content-Length']) == 0:
        print("Something went wrong. Got Content-Length: 0")
        return None
    
    # Extract the response body
    html = r.text
    
    # Parse it with BeautifulSoup
    return BeautifulSoup(html, 'html.parser')

## Part 1 - Scraper Module

The first step is to fetch all the data from ISA.

To do this, we will have to scrape the respective frames that ISA serves for the form and then the search results. Steps are detailed below.

In [6]:
# URL of the public ISA home page.
isa_home_url = isa_url('%21gedpublicreports.htm?ww_i_reportmodel=133685247')

#### Get the right frame URL from the home page

In [7]:
# Get the home page content
home_soup = get_page(isa_home_url)

In [8]:
# Find the frame containing the form
toc_frame = home_soup.find('frame', attrs={'name': 'toc'})

In [9]:
# Extract the frame's URL
toc_frame_url = toc_frame['src']
toc_frame_url

'!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247'

#### Let's now get the form itself

In [10]:
toc_url = isa_url(toc_frame_url)

In [11]:
toc_soup = get_page(toc_url)

In [12]:
# Find the form itself within the page.
# The form's name is a heuristic, read from the page's source.
form_elem = toc_soup.find('form', attrs={'name': 'f'})

In [13]:
# Find the form's action on validation
form_action = form_elem['action']
form_action_url = isa_url(form_action)
form_action_url

'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter'

### Manually craft the same request submitting the form would send to the server

The requests below corresponds to:

**Format:** html  
**Unité academique:** Informatique  
**Période académique:** *blank*  
**Période pédagogique:** *blank*  
**Type de semestre:** *blank*  

In [14]:
# Define a payload dictionary with all the parameters expected by ISA's API endpoint

# /!\ By default we will request all possibilities from the endpoint, and then filter. /!\ #

# Example requests for specific academic or pedagogic periods are left FYI.

payload = {
    'ww_b_list': '1',
    'ww_i_reportmodel': '133685247',
    'ww_c_langue': '',
    
    # Format = html
    'ww_i_reportModelXsl': '133685270', 
    
    # Faculty = { Informatique : 249847 }
    'zz_x_UNITE_ACAD': 'Informatique',
    'ww_x_UNITE_ACAD': '249847',
    
    # Academic Period = { '' : null } if we want all possible periods
    #                 = { '2016-2017' : 355925344 } for a specific year
    'zz_x_PERIODE_ACAD': '',
    'ww_x_PERIODE_ACAD': 'null',

    
    # Pedagogic Period = { '' : null } if we want all possible periods
    #                  = { 'Bachelor semestre 1' : 249108 } for a specific period
    'zz_x_PERIODE_PEDAGO': '',
    'ww_x_PERIODE_PEDAGO': 'null',
    
    # Winter or summer semesters
    'zz_x_HIVERETE': '',
    'ww_x_HIVERETE': 'null',
    
    'dummy': 'ok'
}

#### Send the request and extract the list of links that would appear below the form had we submitted it using the browser

In [15]:
# Parse the result
form_result_soup = get_page(form_action_url, params=payload)
# form_result_soup.prettify() # Debug print

In [16]:
# Extract the links from the list of results
all_links = form_result_soup('a', class_='ww_x_GPS')
# all_links # Debug print

Store the links in a dictionary. Some cleaning is applied, such as whitespace stripping

> **Note:** We remove the first link, named 'Tous', as we will request each page individually
      in order to avoid downloading a huge HTML table, which would also be much more painful to parse.

In [17]:
links = [
    objdict({
        'text': link.text.strip(),
        'url': link.attrs['onclick']
    }) for link in all_links if link.text.strip() != 'Tous'
]

For each link,the page loads the respective results with the following on-click action :

```
loadReport('ww_x_GPS=2021043255');return false;
```

We will use the following function ```parse_url()``` to extract the ```ww_x_GPS``` parameter expected by the API to return the desired information about the students.

In [18]:
parse_url_pattern = r"""'(\w+)=(\d+)'"""
parse_url_re = re.compile(parse_url_pattern)

def parse_url(url):
    match = re.search(parse_url_pattern, url)

    if match == None:
        print('parse_url: parsing failure: %s' % url)
        return None
    
    return objdict({
        'key': match.group(1),
        'value': match.group(2)
    })

The list itself must be parsed to extract information regarding section, academic year, pedagogic period. The format is the following :

```
Informatique, 2016-2017, Bachelor semestre 1
```

We call an element of the list an item, and it will be parsed by the following ```parse_item()``` function.

In [19]:
parse_item_pattern = r"""Informatique,\s*(?P<start_year>\d+)-(?P<end_year>\d+),\s*(?P<degree>[\w]+) semestre (?P<semester>\d)"""
parse_item_re = re.compile(parse_item_pattern)

def parse_item(item, url, payload):
    match = re.match(parse_item_re, item)
    
    # The above regex will match only the kind of records we are interested in, so we can
    # safely drop all the non-matching ones
    if match == None:
        # print('Match failed: %s ' % item)
        return None
    
    parsed_url = parse_url(url)
    
    if parsed_url == None:
        return None
    
    params = payload.copy()
    params[parsed_url.key] = parsed_url.value
    
    return objdict({
        'start_year': int(match.group('start_year')),
        'end_year': int(match.group('end_year')),
        'degree': match.group('degree'),
        'semester': int(match.group('semester')),
        'params': params
    })

# Filter all None elements from the given list
def filter_none(a_list):
    return [item for item in a_list if item != None]

In [20]:
all_items = [parse_item(link.text, link.url, payload) for link in links]

In [21]:
def is_valid_item(item):
    return item != None and item.start_year >= 2007 and item.end_year <= 2017

def filter_items(items):
    return [item for item in items if is_valid_item(item)]

In [40]:
items = filter_items(all_items)


[{'degree': 'Bachelor',
  'end_year': 2008,
  'params': {'dummy': 'ok',
   'ww_b_list': '1',
   'ww_c_langue': '',
   'ww_i_reportModelXsl': '133685270',
   'ww_i_reportmodel': '133685247',
   'ww_x_GPS': '71297550',
   'ww_x_HIVERETE': 'null',
   'ww_x_PERIODE_ACAD': 'null',
   'ww_x_PERIODE_PEDAGO': 'null',
   'ww_x_UNITE_ACAD': '249847',
   'zz_x_HIVERETE': '',
   'zz_x_PERIODE_ACAD': '',
   'zz_x_PERIODE_PEDAGO': '',
   'zz_x_UNITE_ACAD': 'Informatique'},
  'semester': 2,
  'start_year': 2007},
 {'degree': 'Bachelor',
  'end_year': 2008,
  'params': {'dummy': 'ok',
   'ww_b_list': '1',
   'ww_c_langue': '',
   'ww_i_reportModelXsl': '133685270',
   'ww_i_reportmodel': '133685247',
   'ww_x_GPS': '39494788',
   'ww_x_HIVERETE': 'null',
   'ww_x_PERIODE_ACAD': 'null',
   'ww_x_PERIODE_PEDAGO': 'null',
   'ww_x_UNITE_ACAD': '249847',
   'zz_x_HIVERETE': '',
   'zz_x_PERIODE_ACAD': '',
   'zz_x_PERIODE_PEDAGO': '',
   'zz_x_UNITE_ACAD': 'Informatique'},
  'semester': 3,
  'start_year':

#### Going through the years to extract the list of enrolled students

In [23]:
list_page_url = isa_url('!GEDPUBLICREPORTS.html')

def parse_student_row(tr, item, headers):
    tds = tr.find_all('td')
    texts = [td.text for td in tds]
    
    res = {}
    for i, header in enumerate(headers):
        res[header] = texts[i]
    
    item_keys = ['degree', 'semester', 'start_year', 'end_year']
    for key in item_keys:
        res[key] = item[key]
    
    return objdict(res)

def parse_students_list(page_soup, item):
    table = page_soup.find('table')
    trs = table.find_all('tr')
    
    if len(trs) < 3:
        return []
    
    (header_row, students_rows) = (trs[1], trs[2:])
        
    headers = [x.text for x in header_row.find_all('th')]
    
    return [
        parse_student_row(student_row, item, headers) for student_row in students_rows
    ]

def load_student_page(item):
    return objdict({
        'soup': get_page(list_page_url, item.params),
        'item': item
    })

def flatten_list(a_list):
    return [item for sublist in a_list for item in sublist]

# TODO: Add logging statements above to see the progress

In [24]:
load_from_disk = True
save_path = "students_data.p"

if load_from_disk:
    student_list = pickle.load(open(save_path, 'rb'))
else:
    students_pages = [load_student_page(item) for item in items]

    students_lists = [
        parse_students_list(page.soup, page.item) for page in students_pages
    ]

    student_list = flatten_list(students_lists)
    
    # Serialize the whole data to disk
    pickle.dump(student_list, open(save_path, 'wb'))

## Part 2 - Data analysis

### Question 1

> Keep only the students for which you have an entry for both Bachelor semestre 1 and Bachelor semestre 6. Compute how many months it took each student to go from the first to the sixth semester. Partition the data between male and female students, and compute the average -- is the difference in average statistically significant?

In [25]:
students_data = pd.DataFrame(student_list)

def load_bsc_data(data):
    bsc_data = students_data[['No Sciper', 'Civilité', 'Nom Prénom', 'degree', 'semester', 'start_year', 'end_year']]
    return bsc_data.rename(columns={'No Sciper': 'sciper', 'Civilité': 'title', 'Nom Prénom': 'name'})
    
bsc_data = load_bsc_data(students_data)

In [26]:
def filter_bsc_students(bsc_data):
    bsc_students = bsc_data[bsc_data['degree'] == 'Bachelor']
    idx1 = set(bsc_students[bsc_students['semester'] == 1].set_index('sciper').index)
    idx6 = set(bsc_students[bsc_students['semester'] == 6].set_index('sciper').index)

    idx = idx1.intersection(idx6)

    return bsc_students[bsc_students['sciper'].isin(idx)].set_index('sciper').sort_index()

bsc = filter_bsc_students(bsc_data)
bsc

Unnamed: 0_level_0,title,name,degree,semester,start_year,end_year
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
147008,Monsieur,Good Xavier,Bachelor,4,2009,2010
147008,Monsieur,Good Xavier,Bachelor,1,2008,2009
147008,Monsieur,Good Xavier,Bachelor,2,2008,2009
147008,Monsieur,Good Xavier,Bachelor,3,2009,2010
147008,Monsieur,Good Xavier,Bachelor,6,2010,2011
147008,Monsieur,Good Xavier,Bachelor,5,2010,2011
169569,Monsieur,Arévalo Christian,Bachelor,3,2008,2009
169569,Monsieur,Arévalo Christian,Bachelor,4,2008,2009
169569,Monsieur,Arévalo Christian,Bachelor,2,2007,2008
169569,Monsieur,Arévalo Christian,Bachelor,5,2009,2010


In [27]:
def add_semesters_count(bsc):
    cols = ['sciper', 'semester']
    semester_count = bsc.reset_index()[cols].groupby('sciper').count().rename(columns={'semester': 'semester_count'})
    return bsc.merge(semester_count, left_index=True, right_index=True)
    
with_counts = add_semesters_count(bsc)
with_counts.head(20)

Unnamed: 0_level_0,title,name,degree,semester,start_year,end_year,semester_count
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
147008,Monsieur,Good Xavier,Bachelor,4,2009,2010,6
147008,Monsieur,Good Xavier,Bachelor,1,2008,2009,6
147008,Monsieur,Good Xavier,Bachelor,2,2008,2009,6
147008,Monsieur,Good Xavier,Bachelor,3,2009,2010,6
147008,Monsieur,Good Xavier,Bachelor,6,2010,2011,6
147008,Monsieur,Good Xavier,Bachelor,5,2010,2011,6
169569,Monsieur,Arévalo Christian,Bachelor,3,2008,2009,6
169569,Monsieur,Arévalo Christian,Bachelor,4,2008,2009,6
169569,Monsieur,Arévalo Christian,Bachelor,2,2007,2008,6
169569,Monsieur,Arévalo Christian,Bachelor,5,2009,2010,6


In [28]:
abc = with_count[['title', 'name', 'semester_count']].reset_index().groupby('sciper').agg('first')
abc.sample(10)

NameError: name 'with_count' is not defined

In [None]:
by_title = abc.groupby('title')
by_title['semester_count'].describe()

### Question 2

> Perform a similar operation to what described above, this time for Master students. Notice that this data is more tricky, as there are many missing records in the IS-Academia database. Therefore, try to guess how much time a master student spent at EPFL by at least checking the distance in months between Master semestre 1 and Master semestre 2. If the Mineur field is not empty, the student should also appear registered in Master semestre 3. Last but not the least, don't forget to check if the student has an entry also in the Projet Master tables. Once you can handle well this data, compute the "average stay at EPFL" for master students. Now extract all the students with a Spécialisation and compute the "average stay" per each category of that attribute -- compared to the general average, can you find any specialization for which the difference in average is statistically significant?

In [157]:
def load_msc_data(data):
    cols = [
        'Civilité', 'No Sciper', 'Nom Prénom', 'Statut',
        'Spécialisation', 'Mineur',
        'degree', 'semester', 'end_year', 'start_year'
    ]
    rename = {
        'Civilité': 'title',
        'No Sciper': 'sciper',
        'Nom Prénom': 'name',
        'Statut': 'status',
        'Spécialisation': 'specialisation',
        'Mineur': 'minor'
    }
    
    return students_data[cols].rename(columns=rename)
    
msc_data = load_msc_data(students_data)

def filter_msc_students(students_data):
    return students_data[(students_data['degree'] == 'Master') & (students_data['status'] == 'Présent')]

msc = filter_msc_students(msc_data).set_index('sciper').sort_index()


False

In [30]:
msc_count = add_semesters_count(msc)
msc_count[(msc_count['semester_count'] < 3) & (msc_count['minor'] != '')]

Unnamed: 0_level_0,title,name,status,specialisation,minor,degree,semester,end_year,start_year,semester_count
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
153642,Monsieur,Imhoff Lionel,Présent,,"Mineur en Management, technologie et entrepren...",Master,3,2008,2007,1
153645,Monsieur,Rosat Damien,Présent,,"Mineur en Management, technologie et entrepren...",Master,3,2008,2007,1
160391,Monsieur,Chipounov Vitaly,Présent,,"Mineur en Management, technologie et entrepren...",Master,3,2008,2007,1
172257,Monsieur,Dubut Frédéric,Présent,,"Mineur en Management, technologie et entrepren...",Master,3,2008,2007,1
199294,Monsieur,Jin Xuedong,Présent,,"Mineur en Management, technologie et entrepren...",Master,2,2011,2010,2
199294,Monsieur,Jin Xuedong,Présent,,"Mineur en Management, technologie et entrepren...",Master,3,2012,2011,2
200419,Monsieur,Jing Lifu,Présent,,"Mineur en Management, technologie et entrepren...",Master,2,2011,2010,2
203301,Monsieur,Nishida Keishi,Présent,,"Mineur en Management, technologie et entrepren...",Master,2,2011,2010,2
203301,Monsieur,Nishida Keishi,Présent,,"Mineur en Management, technologie et entrepren...",Master,1,2011,2010,2
210271,Monsieur,Zhang Daiwei,Présent,,"Mineur en Management, technologie et entrepren...",Master,1,2012,2011,2


In [31]:
msc_agg = msc_count[['title', 'name', 'minor', 'semester_count']].reset_index().groupby('sciper').agg('first')
msc_agg.sample(10)

Unnamed: 0_level_0,title,name,minor,semester_count
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
180816,Monsieur,Fond Matthieu,,2
233444,Monsieur,Castellani Mikaël Guy,,2
267543,Monsieur,Agapiou Stylianos,,1
224553,Monsieur,Quinton Pierre Victor Xavier,,1
236802,Monsieur,Bonfils Nils Pascal,,1
268618,Monsieur,Petrovski Bojan,,1
226421,Monsieur,Stucki Nicolas Alexander,,3
256143,Monsieur,Dragic Ozrenko,,2
213945,Monsieur,Cosendey Quentin,,4
155097,Monsieur,Wei Zhe,,3


In [32]:
msc_agg[msc_agg['semester_count'] >= 1].groupby('title').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,semester_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Madame,count,131.0
Madame,mean,2.503817
Madame,std,1.098069
Madame,min,1.0
Madame,25%,2.0
Madame,50%,3.0
Madame,75%,3.0
Madame,max,7.0
Monsieur,count,834.0
Monsieur,mean,2.482014


In [33]:
msc

Unnamed: 0_level_0,title,name,status,specialisation,minor,degree,semester,end_year,start_year
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
128911,Monsieur,Gulati Asheesh,Présent,Internet computing,,Master,3,2008,2007
128911,Monsieur,Gulati Asheesh,Présent,Internet computing,,Master,2,2008,2007
129093,Monsieur,Zhou Maoan,Présent,,,Master,2,2008,2007
129326,Monsieur,Ni Zhong Zhong,Présent,,,Master,3,2008,2007
129326,Monsieur,Ni Zhong Zhong,Présent,,,Master,2,2008,2007
138088,Monsieur,Droz-dit-Busset Arnault,Présent,,,Master,1,2008,2007
145546,Monsieur,Clivaz Jean-Philippe,Présent,,,Master,3,2008,2007
145957,Monsieur,Hügli Michael,Présent,,,Master,3,2008,2007
146330,Monsieur,Cardinaux Damien,Présent,,,Master,2,2008,2007
146330,Monsieur,Cardinaux Damien,Présent,,,Master,3,2009,2008


In [37]:
parse_item_pdm_pattern = r"""Informatique,\s*(?P<start_year>\d+)-(?P<end_year>\d+),\s*Projet Master\s+(?P<season>\w+)"""
parse_item_pdm_re = re.compile(parse_item_pdm_pattern)

def parse_item_pdm(item, url, payload):
    match = re.match(parse_item_pdm_re, item)
    
    # The above regex will match only the kind of records we are interested in, so we can
    # safely drop all the non-matching ones
    if match == None:
        # print('Match failed: %s ' % item)
        return None
    parsed_url = parse_url(url)
    
    if parsed_url == None:
        return None
    
    params = payload.copy()
    params[parsed_url.key] = parsed_url.value
    
    return objdict({
        'start_year': int(match.group('start_year')),
        'end_year': int(match.group('end_year')),
        'season': match.group('season'),
        'params': params
    })


all_items_pdm = [parse_item_pdm(link.text, link.url, payload) for link in links]
items_pdm = filter_items(all_items_pdm)

[{'end_year': 2008,
  'params': {'dummy': 'ok',
   'ww_b_list': '1',
   'ww_c_langue': '',
   'ww_i_reportModelXsl': '133685270',
   'ww_i_reportmodel': '133685247',
   'ww_x_GPS': '39495382',
   'ww_x_HIVERETE': 'null',
   'ww_x_PERIODE_ACAD': 'null',
   'ww_x_PERIODE_PEDAGO': 'null',
   'ww_x_UNITE_ACAD': '249847',
   'zz_x_HIVERETE': '',
   'zz_x_PERIODE_ACAD': '',
   'zz_x_PERIODE_PEDAGO': '',
   'zz_x_UNITE_ACAD': 'Informatique'},
  'season': 'automne',
  'start_year': 2007},
 {'end_year': 2008,
  'params': {'dummy': 'ok',
   'ww_b_list': '1',
   'ww_c_langue': '',
   'ww_i_reportModelXsl': '133685270',
   'ww_i_reportmodel': '133685247',
   'ww_x_GPS': '39495444',
   'ww_x_HIVERETE': 'null',
   'ww_x_PERIODE_ACAD': 'null',
   'ww_x_PERIODE_PEDAGO': 'null',
   'ww_x_UNITE_ACAD': '249847',
   'zz_x_HIVERETE': '',
   'zz_x_PERIODE_ACAD': '',
   'zz_x_PERIODE_PEDAGO': '',
   'zz_x_UNITE_ACAD': 'Informatique'},
  'season': 'printemps',
  'start_year': 2007},
 {'end_year': 2009,
  'par

In [46]:
def parse_student_row_pdm(tr, item, headers):
    tds = tr.find_all('td')
    texts = [td.text for td in tds]
    
    res = {}
    for i, header in enumerate(headers):
        res[header] = texts[i]
    
    item_keys = ['season', 'start_year', 'end_year']
    for key in item_keys:
        res[key] = item[key]
    
    return objdict(res)

def parse_students_list_pdm(page_soup, item):
    table = page_soup.find('table')
    trs = table.find_all('tr')
    
    if len(trs) < 3:
        return []
    
    (header_row, students_rows) = (trs[1], trs[2:])
        
    headers = [x.text for x in header_row.find_all('th')]
    
    return [
        parse_student_row_pdm(student_row, item, headers) for student_row in students_rows
    ]

In [42]:
pdm_students_pages = [load_student_page(item) for item in items_pdm]

pdm_students_pages

students_lists_pdm = [
    parse_students_list_pdm(page.soup, page.item) for page in pdm_students_pages
    ]

student_list_pdm = flatten_list(students_lists_pdm) 

In [45]:
student_list_pdm[1:10]

[{'Civilité': 'Monsieur',
  'Ecole Echange': '',
  'Filière opt.': '',
  'Mineur': '',
  'No Sciper': '159852',
  'Nom Prénom': 'Brutsche\xa0Florian',
  'Orientation Bachelor': '',
  'Orientation Master': '',
  'Spécialisation': 'Internet computing',
  'Statut': 'Congé',
  'Type Echange': '',
  'end_year': 2008,
  'season': 'printemps',
  'start_year': 2007},
 {'Civilité': 'Monsieur',
  'Ecole Echange': '',
  'Filière opt.': '',
  'Mineur': '',
  'No Sciper': '153819',
  'Nom Prénom': 'Dotta\xa0Mirco',
  'Orientation Bachelor': '',
  'Orientation Master': '',
  'Spécialisation': '',
  'Statut': 'Stage',
  'Type Echange': '',
  'end_year': 2008,
  'season': 'printemps',
  'start_year': 2007},
 {'Civilité': 'Monsieur',
  'Ecole Echange': '',
  'Filière opt.': '',
  'Mineur': '',
  'No Sciper': '145957',
  'Nom Prénom': 'Hügli\xa0Michael',
  'Orientation Bachelor': '',
  'Orientation Master': '',
  'Spécialisation': '',
  'Statut': 'Stage',
  'Type Echange': '',
  'end_year': 2008,
  'sea

In [239]:
students_pdm_data = pd.DataFrame(student_list_pdm)
students_pdm.sample(10)


def load_pdm_data(data):
    pdm_data = students_pdm_data[['No Sciper', 'Civilité', 'Nom Prénom', 'Mineur', 'Statut', 'start_year', 'end_year','season']]
    return pdm_data.rename(columns={'No Sciper': 'sciper', 'Civilité': 'title', 'Nom Prénom': 'name'})
    
pdm_data = load_pdm_data(students_pdm_data)

pdm_data = pdm_data.set_index('sciper').sort_index()
pdm_data.index.is_unique

False

In [240]:
pdm_data.sample(10)

Unnamed: 0_level_0,title,name,Mineur,Statut,start_year,end_year,season
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
227848,Monsieur,Majeri Kasmaei Chervine,Information security minor,Présent,2016,2017,automne
154573,Madame,Benabdallah Zeineb,,Présent,2009,2010,printemps
223102,Monsieur,Lukovics Akos,,Présent,2013,2014,automne
214200,Monsieur,Maitre Grégory Ludovic,,Présent,2016,2017,automne
159852,Monsieur,Brutsche Florian,,Congé,2007,2008,printemps
214101,Monsieur,Robert Arnaud,,Présent,2016,2017,automne
227356,Monsieur,Leiva Loris Angel,,Présent,2016,2017,automne
175702,Monsieur,Steiger Robin,,Stage,2010,2011,printemps
244136,Monsieur,Cardoso Rodrigues Bonifácio Vítor Gonçalo,"Mineur en Management, technologie et entrepren...",Présent,2016,2017,automne
244283,Madame,Zaridze Ketevani,,Présent,2016,2017,automne


Unnamed: 0,sciper,title,name,Mineur,Statut,start_year,end_year,season
0,173527,Monsieur,Stewart Conail,,Présent,2007,2008,automne
1,159852,Monsieur,Brutsche Florian,,Congé,2007,2008,printemps
2,153819,Monsieur,Dotta Mirco,,Stage,2007,2008,printemps
3,145957,Monsieur,Hügli Michael,,Stage,2007,2008,printemps
4,173257,Monsieur,Indra Saurabh,,Présent,2007,2008,printemps
5,160150,Monsieur,Lépine Simon,,Présent,2007,2008,printemps
6,173527,Monsieur,Stewart Conail,,Présent,2007,2008,printemps
7,180027,Madame,Agarwal Megha,,Stage,2008,2009,automne
8,159852,Monsieur,Brutsche Florian,,Congé,2008,2009,automne
9,166805,Monsieur,Fleury Marc-Olivier,,Stage,2008,2009,automne


In [197]:

msc_pdm = msc.reset_index().drop_duplicates(subset='sciper').set_index(['sciper'])


#msc_pdm.sample(10)
msc_pdm.index.is_unique
pdm_data.index.is_unique


pdm_data= pdm_data.reindex(msc_pdm.index).set_index(msc_pdm.index)
pdm_data.sort_index()

Unnamed: 0_level_0,sciper,title,name,Mineur,Statut,start_year,end_year,season
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
128911,NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN
129093,NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN
129326,NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN
138088,NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN
145546,NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN
145957,NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN
146330,NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN
146441,NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN
146742,NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN
146752,NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN


AttributeError: 'numpy.ndarray' object has no attribute 'unique'