In [2]:
import requests
import pandas as pd 
from bs4 import BeautifulSoup
import csv
# Pour le test statistique
import scipy.stats as stats
import math
import numpy as np

cols = ['Période académique', 'Période pédagogique', 'Civilité',
        'Nom et prénom','Sciper','Mineur','Spécialisation','Statut']
ALL_DATA = pd.DataFrame(columns=cols)

In [3]:
def arrange_student(student_tags) : 
    
    student = {}
    
    if(len(student_tags)!=0) : 
        
        student = {}
    
        student['Civilité'] = student_tags[0].contents[0]
        student['Nom et prénom'] = student_tags[1].contents[0]
        student['Sciper'] = student_tags[10].contents[0]
        student['Statut'] = student_tags[7].contents[0]
        
        if(len(student_tags[4].contents)!=0) : 
            student['Spécialisation'] = student_tags[4].contents[0]
        
        if(len(student_tags[6].contents)!=0) : 
            student['Mineur'] = student_tags[6].contents[0]
            
        
    return student

In [4]:
#ACQUISITION DES FILTRES SOUS FORME DE DICTIONNAIRE : OK 

url1 = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247'
r1 = requests.get(url1)
soup1 = BeautifulSoup(r1.content,"lxml")

filters = {}

for filt in soup1.findAll("select"):
    
    filter_values = {}
        
    for option in filt.findAll("option"):
       
        if(option['value']!='null') : #juste pr éviter le premier element ki est vide
            filter_values[option.contents[0]] = option['value']
    
    filter_values[''] = 'null'
    filters[filt['name']] = filter_values

In [5]:
#URL avec lequel on va faire les requêtes et fichier csv qu'on va créer 

url2 = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?'

#Paramètres qu'on va passer aux filtres
fixed_params = {}
fixed_params['ww_x_GPS'] = '-1'
fixed_params['ww_i_reportModel'] = '133685247'
fixed_params['ww_i_reportModelXsl'] = '133685270'
fixed_params['ww_x_UNITE_ACAD'] = filters['ww_x_UNITE_ACAD']['Informatique']

#Truc ou on va stocker tt les etudiants de master quelque soit leur niveau

master_students = []

for periode_pedagogique in ['Master semestre 1','Master semestre 2','Master semestre 3',
                            'Projet Master automne','Projet Master printemps'] :

    for periode_academique in filters['ww_x_PERIODE_ACAD'].keys() : 
        
        if(periode_academique != '') :
    
            parameters = fixed_params.copy()
            parameters['ww_x_PERIODE_PEDAGO'] = filters['ww_x_PERIODE_PEDAGO'][periode_pedagogique]
            parameters['ww_x_PERIODE_ACAD'] = filters['ww_x_PERIODE_ACAD'][periode_academique]        
            
            req = requests.get(url2,params=parameters)
            soup = BeautifulSoup(req.content,"lxml")
            
            
            for row in soup.find('table').contents[2:] : #on commence 2ème ligne pasque les premières c'est des headers
       
                s = arrange_student(row.findAll('td'))
                s['Période académique'] = periode_academique
                s['Période pédagogique'] = periode_pedagogique
                master_students.append(s)

In [6]:
for student in master_students : 
    ALL_DATA = ALL_DATA.append(pd.Series(student), ignore_index=True)
# We drop all rows that are present more than once
ALL_DATA=ALL_DATA = ALL_DATA.drop_duplicates()


In [7]:
print(ALL_DATA.shape)

(2844, 8)


In [13]:
# Sorted by name
ALL_DATA=ALL_DATA.sort_values('Nom et prénom')

# Reindexing with names
NEW_ALL_DATA=ALL_DATA.set_index(['Nom et prénom'])

# Have a look at the current Dataframe
NEW_ALL_DATA.head(50)

Unnamed: 0_level_0,Période académique,Période pédagogique,Civilité,Sciper,Mineur,Spécialisation,Statut
Nom et prénom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Abbadi Hajar,2013-2014,Master semestre 2,Madame,238067,"Mineur en Management, technologie et entrepren...",,Présent
Abbadi Hajar,2014-2015,Master semestre 3,Madame,238067,"Mineur en Management, technologie et entrepren...",,Présent
Abbadi Hajar,2013-2014,Master semestre 1,Madame,238067,,,Présent
Abelenda Diego,2011-2012,Master semestre 2,Monsieur,170646,,,Présent
Abelenda Diego,2011-2012,Master semestre 3,Monsieur,170646,,,Présent
Abelenda Diego,2010-2011,Master semestre 1,Monsieur,170646,,"Signals, Images and Interfaces",Présent
Abelenda Diego,2010-2011,Master semestre 2,Monsieur,170646,,"Signals, Images and Interfaces",Présent
Abi Akar Nora,2015-2016,Master semestre 1,Madame,251253,,,Présent
Abi Akar Nora,2016-2017,Master semestre 3,Madame,251253,,,Présent
Abi Akar Nora,2015-2016,Master semestre 2,Madame,251253,,,Présent


In [14]:
NEW_ALL_DATA['Période pédagogique'].value_counts()

Master semestre 2          1062
Master semestre 1           948
Master semestre 3           706
Projet Master automne        86
Projet Master printemps      42
Name: Période pédagogique, dtype: int64

On the above table, we can already see that if Mineur field is different than "NaN", the corresponding student is registered in " Master Semester 3"

For the time lenght of the master, we have some assumptions:

1- By "time a master student spent at epfl", we understand the time during which the student is registered on the EPFL tables without regard to the Status "présent", "congé"(absent) or "Stage"(internship).

1- We don't consider the students whose name appears once, it means they didn't achieve their master.

2- For all student, we assume that each line counts for 6 months. We have treated all the cases since we have in the dataFrame "NEW_ALL_DATA" all kind of "Période pédagogique": Master semestre 1, 2, 3, Projet Master automne and printemps.


In [9]:
cols_result = ['Name','Sex','First year of master', 'Last year of master', 'Time_Master (months)','Spécialisation','Mineur']
result = pd.DataFrame(columns=cols_result)

# For each student
for name in NEW_ALL_DATA.index.drop_duplicates() :
    
    # Keep only student who appears more than once.
    if(NEW_ALL_DATA.loc[name].shape[0]!=7):
        
        temp_frame = NEW_ALL_DATA.loc[name]
        t = temp_frame.shape[0]*6
        
        temp = {'Name' : name,'Sex' : str(NEW_ALL_DATA.loc[name]['Civilité'].values[0]) ,'First year of master' :temp_frame['Période académique'].min(),
                'Last year of master' :temp_frame['Période académique'].max() ,'Time_Master (months)' : t,
                'Spécialisation': str(NEW_ALL_DATA.loc[name]['Spécialisation'].values[0]),'Mineur': str(NEW_ALL_DATA.loc[name]['Mineur'].values[0])}
        result = result.append(temp, ignore_index=True)


In [12]:
# Print the DataFrame with the master lenght in months computed

result.head()

Unnamed: 0,Name,Sex,First year of master,Last year of master,Time_Master (months),Spécialisation,Mineur
0,Abbadi Hajar,Madame,2013-2014,2014-2015,18.0,,"Mineur en Management, technologie et entrepren..."
1,Abelenda Diego,Monsieur,2010-2011,2011-2012,24.0,,
2,Abi Akar Nora,Madame,2015-2016,2016-2017,18.0,,
3,Aeberhard François-Xavier,Monsieur,2007-2008,2009-2010,36.0,Internet computing,
4,Aeby Prisca,Madame,2015-2016,2016-2017,18.0,,


### Average stay at EPFL

In [10]:
result.mean()[0]

19.977099236641223

We only keep students who have a spécialisation in a new DataFram 'result_spe'

In [11]:
result_spe = result[result.Spécialisation != 'nan']

In [12]:
result_spe['Spécialisation'].value_counts()

Internet computing                67
Foundations of Software           48
Signals, Images and Interfaces    20
Computer Engineering - SP         17
Software Systems                  15
Information Security - SP          5
Biocomputing                       5
Data Analytics                     3
Service science                    1
Computer Science Theory            1
Internet Information Systems       1
Name: Spécialisation, dtype: int64

In [13]:
result_spe.groupby('Spécialisation').mean()

Unnamed: 0_level_0,Time_Master (months)
Spécialisation,Unnamed: 1_level_1
Biocomputing,21.6
Computer Engineering - SP,21.882353
Computer Science Theory,18.0
Data Analytics,16.0
Foundations of Software,23.125
Information Security - SP,20.4
Internet Information Systems,18.0
Internet computing,22.38806
Service science,18.0
"Signals, Images and Interfaces",24.6


Searching the spécialisation for which the difference with the average stay at EPFL is statistically significant.

For spécialisation subset with only few students, the test is not interesting.

In [14]:
# For all spécialisation
for spe in result_spe['Spécialisation'].drop_duplicates():
    
    # We keep only the values of student in the specialisation
    results_ = np.array(result_spe[result_spe.Spécialisation==spe]['Time_Master (months)'])
    print('For the specialisation : ', spe, 'here is the result of the test')
    if(len(results_)==1 or len(results_)==2 or len(results_)==3):
        print(' NOT PERTINENT TEST (Not enough students ...) \n')
    
    print('taille de l échantillon =', len(results_) )
    print('p_value = ',stats.ttest_1samp(a= results_, popmean= result.mean()[0])[1])
    if(stats.ttest_1samp(a= results_, popmean= result.mean()[0])[1] <= 0.05):
        print('We reject the null hypothesis: the subset of student coming from the specialisation', spe, 
                  ' come from a different distribution as all the students \n')
    else:
        print('We accept the null hypothesis: the subset of student coming from the specialisation', spe, 
              ' come from the same distribution as all the students \n')
            
        

For the specialisation :  Internet Information Systems here is the result of the test
 NOT PERTINENT TEST (Not enough students ...) 

taille de l échantillon = 1
p_value =  nan
We accept the null hypothesis: the subset of student coming from the specialisation Internet Information Systems  come from the same distribution as all the students 

For the specialisation :  Information Security - SP here is the result of the test
taille de l échantillon = 5
p_value =  0.868691474497
We accept the null hypothesis: the subset of student coming from the specialisation Information Security - SP  come from the same distribution as all the students 

For the specialisation :  Signals, Images and Interfaces here is the result of the test
taille de l échantillon = 20
p_value =  0.00317644878982
We reject the null hypothesis: the subset of student coming from the specialisation Signals, Images and Interfaces  come from a different distribution as all the students 

For the specialisation :  Internet 



We reject the null hypothesis: the subset of student coming from the specialisation Foundations of Software  come from a different distribution as all the students 

For the specialisation :  Biocomputing here is the result of the test
taille de l échantillon = 5
p_value =  0.755897130619
We accept the null hypothesis: the subset of student coming from the specialisation Biocomputing  come from the same distribution as all the students 

For the specialisation :  Computer Engineering - SP here is the result of the test
taille de l échantillon = 17
p_value =  0.207341578891
We accept the null hypothesis: the subset of student coming from the specialisation Computer Engineering - SP  come from the same distribution as all the students 

For the specialisation :  Data Analytics here is the result of the test
 NOT PERTINENT TEST (Not enough students ...) 

taille de l échantillon = 3
p_value =  0.185070579504
We accept the null hypothesis: the subset of student coming from the specialisati

# Question Bonus

In [15]:
#Time master average in months by sex: 

result_by_sex = result.groupby('Sex')
result_by_sex.mean()


Unnamed: 0_level_0,Time_Master (months)
Sex,Unnamed: 1_level_1
Madame,20.0
Monsieur,19.973568


Computing arrays for Men/Women Time master values.

In [16]:
# Array with the Time_master (months) values for men
men = np.array(result[result.Sex=='Monsieur']['Time_Master (months)'])

# Array with the Time_master (months) values for women
women = np.array(result[result.Sex=='Madame']['Time_Master (months)'])


Doing a Two Sample T-test assuming that both samples (men and women) do not have the same variance. In effect, we don't have information about that.

The null hypothesis is that the mean of both groups are the same. We choose a significant level of 0.05.

In [17]:
stats.ttest_ind(a= men,
                b= women,
                equal_var=False) 

Ttest_indResult(statistic=-0.042945428864118659, pvalue=0.96580682435779752)

We accept, from far, the null hypothesis. Then we have prooved that both means are the same

In [18]:
import matplotlib.pyplot as plt
from pylab import*

In [19]:
women_means = result[result.Sex=='Madame'].groupby(['First year of master']).mean()
men_means = result[result.Sex=='Monsieur'].groupby(['First year of master']).mean()

In [21]:
x = np.linspace(2007,2015, num=9)
y_men = men_means.reset_index()['Time_Master (months)'].values
y_women = women_means.reset_index()['Time_Master (months)'].values

In [23]:
fig, axes = plt.subplots()
axes.plot(x,y_men, label="Men")
axes.plot(x,y_women, label = 'Women')
axes.legend(loc=1)
axes.set_ylabel('Average duration of Master')
axes.set_xlabel('First year of master')
axes.set_title('Average duration of Master according to the time ')
fig.savefig("image.png")

<img src="image.png">

We can observe on the plot a big similarity between the two curves and then confirm what we have statistically prooved.