In [None]:
from bs4 import BeautifulSoup
%matplotlib inline
import requests
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import pylab as pl
plt.style.available
plt.style.use('bmh')

sns.set_context('notebook')
"""
Example usage:
>>> from data_mining import *
>>> mine_data('2009', 'B1', 'Spring')
...


Will output a global data frame indexed by sciper
The data frame is indexed by the the year, semester and sciper
"""

# Academic year -> identifier sent as `ww_x_PERIODE_ACAD` in the report URL
# (see get_url).
years = {
    '2007': '978181',
    '2008': '978187',
    '2009': '978195',
    '2010': '39486325',
    '2011': '123455150',
    '2012': '123456101',
    '2013': '213637754',
    '2014': '213637922',
    '2015': '213638028',
    '2016': '355925344',
}

# Semester label -> identifier sent as `ww_x_PERIODE_PEDAGO` in the report URL
# (see get_url).
semesters = {
    'B1': '249108',
    'B5': '942120',
    'B6': '942175',
    'M1': '2230106',
    'M2': '942192',
    'M3': '2230128',
    'PMAut': '249127',
    'PMSpr': '3781783',
}

indexes =  []

def get_url(year, semester):
    """Build the report URL for a given year and semester.

    `year` must be a key of `years` and `semester` a key of `semesters`;
    a KeyError is raised otherwise.
    """
    period_acad = years[year]
    period_pedago = semesters[semester]
    return (
        'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html'
        '?ww_x_GPS=-1'
        '&ww_i_reportModel=133685247'
        '&ww_i_reportModelXsl=133685270'
        '&ww_x_UNITE_ACAD=249847'
        '&ww_x_PERIODE_ACAD=' + period_acad
        + '&ww_x_PERIODE_PEDAGO=' + period_pedago
        + '&ww_x_HIVERETE=null'
    )

def request(year, semester, timeout=30):
    """Send the HTTP GET request for a given year and semester.

    Parameters
    ----------
    year, semester : keys accepted by get_url
    timeout : seconds to wait for the server; without a timeout `requests`
        waits indefinitely, which would hang the whole notebook on a stalled
        connection

    Returns
    -------
    The `requests.Response` of the request.
    """
    url = get_url(year, semester)
    # NOTE(review): verify=False disables TLS certificate verification;
    # confirm it is still needed for this endpoint.
    return requests.get(url, verify=False, timeout=timeout)

def get_soup(year, semester):
    """Fetch the page of a given year and semester and parse it.

    Returns the BeautifulSoup object built from the response text with the
    "lxml" parser.
    """
    response = request(year, semester)
    return BeautifulSoup(response.text, "lxml")


def get_table(soup, year, semester):
    """Turn the table of a scraped page into a DataFrame.

    The first table row is a comma-separated header whose third field holds
    the number of students in parentheses, the second row holds the column
    names and every following row describes one student.

    Parameters
    ----------
    soup : parsed page exposing ``soup.html.body.table`` (see get_soup)
    year : value stored in the ``<semester>_year`` column
    semester : semester label used to name that column

    Returns
    -------
    A DataFrame indexed by (last scraped column, 'Nom Prénom') holding the
    remaining scraped columns plus ``<semester>_year``, or an empty DataFrame
    when the header announces zero students.

    Raises
    ------
    ValueError
        If a student row does not have as many cells as there are columns.
    """
    rows = soup.html.body.table.children  # iterator over the table rows
    header_row = next(rows, None)
    attrs = header_row.text.split(',')
    # third header field looks like "... (<count> ...)"; keep the count
    nb_student = int(attrs[2].split("(")[1].split(" ")[0])
    if nb_student == 0:
        # no student: the page carries no column row, nothing to parse
        return pd.DataFrame()

    column_row = next(rows, None)
    columns = [cell.text for cell in column_row.children]

    # Collect every row first and build the frame once; appending row by row
    # with `df.loc[len(df)] = ...` re-allocates the frame for each student.
    records = []
    for row in rows:
        values = [cell.text for cell in row.children][:-1]  # drop the trailing cell
        if len(values) != len(columns):
            raise ValueError(
                'row has %d cells but %d columns were announced'
                % (len(values), len(columns))
            )
        records.append(values)

    df = pd.DataFrame(records, columns=columns)
    df = df.set_index([columns[-1], 'Nom Prénom'])  # last column is the sciper
    df[semester + '_year'] = year
    return df

def mine_data(year, semester):
    """Download and parse the student table of a given year and semester.

    Returns the DataFrame built by get_table from the page fetched by get_soup.
    """
    return get_table(get_soup(year, semester), year, semester)


In [None]:
# Seed the frame with the 2007 B1 students and create the B5/B6 year columns
# up front: DataFrame.update only writes into columns that already exist.
df_bachelor = mine_data('2007', 'B1')
for missing_col in ('B5_year', 'B6_year'):
    df_bachelor[missing_col] = float('NaN')  # NaN can only be stored as a float

for year in range(2007, 2017):
    year_key = str(year)
    # add the B1 students not seen yet; known students keep their first B1 entry
    df_bachelor = df_bachelor.combine_first(mine_data(year_key, 'B1'))

    # overwrite with the values of the newer semesters (B6 first, then B5)
    for late_semester in ('B6', 'B5'):
        df_bachelor.update(mine_data(year_key, late_semester))

In [None]:
# NOTE(review): when run as a notebook cell, presumably only the value of the
# last expression (the tail) is rendered; the head() result is not shown.
df_bachelor.head(10)
df_bachelor.tail(10)

Now we keep only the rows which have a value in both `B1_year` and `B6_year`.

In [None]:
# Keep the students for whom both a B1 year and a B6 year are known.
required_years = ['B1_year', 'B6_year']
df_bachelor = df_bachelor.dropna(subset=required_years)
df_bachelor.head(10)

Bachelor 6 takes place in the spring semester, i.e. in the calendar year following the start of the academic year, so we add one year to `B6_year`.

In [None]:
# The years are stored as text: convert B6_year to a number, then shift it by
# one year. Running this cell again shifts the column once more.
b6_numeric = df_bachelor[['B6_year']].apply(pd.to_numeric)
df_bachelor[['B6_year']] = b6_numeric
df_bachelor['B6_year'] = df_bachelor['B6_year'] + 1

In [None]:
df_bachelor.head(10)

In [None]:
def compute_time(df):
    """Return the number of months spent on the bachelor for one student row.

    `df` is a single row (anything indexable by 'B1_year', 'B5_year' and
    'B6_year' whose values convert to float). 'B6_year' is presumably already
    shifted to the calendar year in which the spring semester ends.
    """
    fifth = float(df['B5_year'])
    sixth = float(df['B6_year'])
    # Finishing with the fifth semester means six more months; otherwise the
    # student finishes in the summer of the sixth semester. A missing (NaN)
    # B5 year never compares >=, so it falls back to the sixth semester.
    end = fifth + 0.5 if fifth >= sixth else sixth
    # Total time in months. The first semester runs sep-jan and the second
    # feb-jul, so only 5 months are counted to complete the first: hence -1.
    return (end - float(df['B1_year'])) * 12 - 1

In [None]:
# Months needed for the bachelor, computed row by row with compute_time.
bachelor_months = df_bachelor.apply(compute_time, axis=1)
df_bachelor['bachelor_months'] = bachelor_months

In [None]:
df_bachelor.head(10)

We simply need to group by `Civilité` to get the average time per gender

In [None]:
df_bachelor["Civilité"].astype('category')
sex_grouped = df_bachelor.groupby('Civilité')['bachelor_months']
b_months = sex_grouped.apply(np.average)
b_months.to_frame(name='Months to finish bachelor').unstack().unstack().plot.bar()

In [None]:
sex_grouped.head()

## Statistical significance
In order to evaluate the statistical significance of the difference in average between the male and female students, we first examine the data as a whole, with regard to the number of months spent to get one's bachelor degree.

In [None]:
b_months_min = df_bachelor.bachelor_months.min().astype(int)
b_months_max = df_bachelor.bachelor_months.max().astype(int)
# 6-month wide bins. The edges run one step past the maximum: the last
# histogram bin is closed on the right, so stopping at the maximum would merge
# the longest duration into the previous bar (or drop it when the span is not
# a multiple of 6).
df_bachelor.hist(column='bachelor_months', bins=range(b_months_min, b_months_max + 7, 6))
pl.suptitle("Time spent completing bachelor's degree")
pl.xlabel("Months")
pl.ylabel("Students")

As can be expected, most students complete the bachelor's in 36 months (the mandatory 3 years). A fair amount will need 6 or 12 months more. If we now look at the structure of the data by gender, we can see that it holds roughly the same shape in both cases.

In [None]:
# Same 6-month bins for each gender. The edges run one step past the maximum
# because the last histogram bin is closed on the right; stopping at the
# maximum would merge the longest duration into the previous bar.
df_bachelor.hist(column='bachelor_months', by=df_bachelor['Civilité'], bins=range(b_months_min, b_months_max + 7, 6))
pl.suptitle("Time spent completing bachelor's degree by gender")
pl.xlabel("Months")
pl.ylabel("Students")

### Test 1

Since we are observing a dependent variable (i.e. the number of months to complete a bachelor's degree) in two population samples determined by an independent variable (i.e. the gender of the person), we use a **Mann-Whitney rank test**, as provided by `scipy`'s `mannwhitneyu` function. We do not use the unpaired two-sample t-test because our random variable, the number of months, is not sampled from a normal distribution (the number of months is not even continuous).

In [None]:
# One sample of durations per gender; the unpacking below assumes exactly two
# groups.
values_per_gender = [months for _, months in sex_grouped]
# Request the two-sided test explicitly: H0 is "no difference in either
# direction", and the default of `alternative` differs between SciPy releases.
stat, pval = stats.mannwhitneyu(*values_per_gender, alternative='two-sided')

So in this case, we're working with the null hypothesis $H_0$ that the number of months spent for a bachelor's degree is distributed identically in the two populations, male and female students in computer science (which we use as a proxy for "the two means are equal").

With our two samples, we get the following p-value:

In [None]:
pval

This is just above an $\alpha$ of 0.10, so we fail to reject the null hypothesis $H_0$, i.e. this difference in mean is *not obviously statistically significant*. But being at the limit, it is far from unlikely that female students (*Madame*) take less time to complete the Bachelor.

### Test 2

As an additional test to avoid arbitrary inaccuracies of the Mann-Whitney rank test we used, we can try the K-S test on our two sample data distributions. This is a non-parametric test, and basically computes the max distance between the cumulative distribution functions of the two distributions. 

In [None]:
# Two-sample Kolmogorov-Smirnov test on the same per-gender samples.
ks_outcome = stats.ks_2samp(*values_per_gender)
stat, pval = ks_outcome
pval

Once again, the result of the test indicates that we fail to reject $H_0$. So we can state that the difference in the mean number of months needed by women and men is not statistically significant.

# Question 2 

Get the data

In [None]:
# Seed the frame with the 2007 M1 students. Only the year columns of the later
# semesters are created here: DataFrame.update writes only into columns that
# already exist. 'M1_year' comes with the scraped frame, and 'Spécialisation'
# is presumably a scraped column as well, so neither is overwritten (a literal
# 'NaN' string would be non-null and would count as a specialisation name).
df_master = mine_data('2007', 'M1')
for year_col in ('M2_year', 'M3_year', 'PMAut_year', 'PMSpr_year'):
    df_master[year_col] = float('NaN')  # NaN can only be stored as a float

for year in range(2007, 2017):
    year_key = str(year)
    # add the M1 students not seen yet; known students keep their first M1 entry
    df_master = df_master.combine_first(mine_data(year_key, 'M1'))

    # overwrite with the values of the newer semesters, in this order
    for later_semester in ('M2', 'M3', 'PMAut', 'PMSpr'):
        df_master.update(mine_data(year_key, later_semester))

In [None]:
df_master.head(10)

Drop the ones without M1 and M2 entries

In [None]:
# Keep the students for whom both an M1 year and an M2 year are known.
master_required_years = ['M1_year', 'M2_year']
df_m = df_master.dropna(subset=master_required_years)

In [None]:
df_m.head(10)

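To compute the length of stay, the end of the master is taken as:
- if `PMAut_year` or `PMSpr_year` is not empty, the last of these values (the master project);
- otherwise, if `M3_year` is not empty, the end of the third semester;
- otherwise, the end of the second semester (`M2_year`).

The length of stay is this end date minus `M1_year`.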

# TODO delete rows with Mineur and not M3_year

In [None]:
#df_m[((df_m.Mineur != '') and (df_m.M3_year > 0)) | (df_m.Mineur == '')]

In [None]:
# The years were stored as text while scraping: convert every year column to
# a number.
master_year_cols = ['M1_year', 'M2_year', 'M3_year', 'PMAut_year', 'PMSpr_year']
df_m[master_year_cols] = df_m[master_year_cols].apply(pd.to_numeric)

In [None]:
df_m

In [None]:
def compute_time_master(df):
    """Return the number of months spent on the master for one student row.

    `df` is a single row (anything indexable by 'M1_year', 'M2_year',
    'M3_year', 'PMAut_year' and 'PMSpr_year'); missing years are null.

    The end of the studies is, in order of priority:
    - the latest master project end (autumn project: year + 0.5,
      spring project: year + 1);
    - the end of the third semester (year + 0.5);
    - the end of the second semester (year + 1).
    """
    # A student may have both an autumn and a spring project entry; the one
    # that ends last marks the end of the studies.
    project_ends = []
    if not pd.isnull(df['PMAut_year']):
        project_ends.append(float(df['PMAut_year']) + 0.5)
    if not pd.isnull(df['PMSpr_year']):
        project_ends.append(float(df['PMSpr_year']) + 1)

    if project_ends:
        end = max(project_ends)
    elif not pd.isnull(df['M3_year']):
        end = float(df['M3_year']) + 0.5
    else:
        end = float(df['M2_year']) + 1
    # same convention as for the bachelor: total months, minus one
    return 12 * (end - float(df['M1_year'])) - 1

In [None]:
# Months needed for the master, computed row by row with compute_time_master.
master_months = df_m.apply(compute_time_master, axis=1)
df_m['master_months'] = master_months

In [None]:
df_m.head()

In [None]:
df_m["Civilité"].astype('category')
sex_grouped_master = df_m.groupby('Civilité')['master_months']
m_years = sex_grouped_master.apply(np.average)
m_years.to_frame(name='Months in master').unstack().unstack().plot.bar()

Now we look at the Spécialisation

In [None]:
# Keep the students whose 'Spécialisation' field is not the empty string.
has_specialisation = df_m['Spécialisation'] != ''
df_spec = df_m[has_specialisation]

In [None]:
df_spec.head(20)

In [None]:
df_spec["Spécialisation"].astype('category')
spec_grouped_master = df_spec.groupby('Spécialisation')['master_months']
m_spec = spec_grouped_master.apply(np.average)
m_spec.to_frame(name='Months in master').unstack().unstack().plot.bar()


## Statistical significance

Once again, we want to evaluate the statistical significance of the difference in means between different populations of our data. In the case of the master's students, we look at the pairwise difference between the general mean time spent completing the master's degree (in months) and the time spent in each specialisation.

Let's have a look at the general layout to begin with.

In [None]:
m_months_min = df_m.master_months.min().astype(int)
m_months_max = df_m.master_months.max().astype(int)
# 6-month wide bins. The edges run one step past the maximum: the last
# histogram bin is closed on the right, so stopping at the maximum would merge
# the longest duration into the previous bar (or drop it when the span is not
# a multiple of 6).
df_m.hist(column='master_months', bins=range(m_months_min, m_months_max + 7, 6))
pl.suptitle("Time spent completing master's degree")
pl.xlabel("Months")
pl.ylabel("Students")

We can do the same thing for all different specialisations, just to get an idea of the layout of the data.

In [None]:
# Same 6-month bins for each specialisation. The edges run one step past the
# maximum because the last histogram bin is closed on the right; stopping at
# the maximum would merge the longest duration into the previous bar.
df_m.hist(column='master_months', by=df_m['Spécialisation'], bins=range(m_months_min, m_months_max + 7, 6))
pl.suptitle("Time spent completing master's degree")
pl.xlabel("Months")
pl.ylabel("Students")

This doesn't tell us much, but we can already notice that for many of the specialisations, we have very little data available.

### Test 1
As for question 1, we can use the **Mann-Whitney rank test** on each "general-specialisation" pair to compute the p-value. We store the result of each pairwise Mann-Whitney statistic in a dictionary.

Once again, we're working with the null hypothesis $H_0$ that the mean in months spent for a master's degree of the two populations is equal.


In [None]:
# Compare every specialisation with the whole master population.
# NOTE(review): `general_m` also contains the students of the specialisation
# under test, so the two samples are not disjoint.
mann_whitney_res = {}
general_m = df_m['master_months']
for name, value in spec_grouped_master:
    # Request the two-sided test explicitly: H0 is "no difference in either
    # direction", and the default of `alternative` differs between SciPy
    # releases.
    stat, pval = stats.mannwhitneyu(general_m, value, alternative='two-sided')
    mann_whitney_res[name] = (stat, pval)

The resulting p-values of the pairwise tests is as follows:

In [None]:
# One line per specialisation with the p-value of its Mann-Whitney test.
for spec_name, (_, p_value) in mann_whitney_res.items():
    print(spec_name, "has p-value \t", p_value)

Let's first list the specialisations with a **p-value of at least 0.10** when compared to the general distribution of students, i.e. we filter out those with a p-value of less than 0.10.

In [None]:
P_THRESHOLD = 0.10

# Specialisations whose p-value is at or above the threshold (H0 not rejected).
filtered_mw_res = {}
for spec_name, result in mann_whitney_res.items():
    if result[1] >= P_THRESHOLD:
        filtered_mw_res[spec_name] = result

print("The following specialisations have p-value over", P_THRESHOLD)
print("====================================================")
for spec_name in filtered_mw_res:
    print(spec_name)

So for all of the above, we fail to reject the null hypothesis $H_0$, meaning that the difference in mean is not obviously statistically significant.

Let's do the opposite and keep only the specialisations with a **p-value below 0.10**.

In [None]:
# Specialisations whose p-value is below the threshold (H0 rejected).
filtered_mw_res = {}
for spec_name, result in mann_whitney_res.items():
    if result[1] < P_THRESHOLD:
        filtered_mw_res[spec_name] = result

print("The following specialisations have p-value below", P_THRESHOLD)
print("====================================================")
for spec_name in filtered_mw_res:
    print(spec_name)

So for the above specialisations, we reject $H_0$, i.e. the mean of number of months spent for a master's degree in general and for the given specialisation is **not equal**.

### Test 2

Let's run the K-S test on our population pairs again to check for inaccuracies in the Mann-Whitney rank test. 

In [None]:
# K-S test of every specialisation against the whole master population; the
# (statistic, p-value) pair is stored under the specialisation name.
ks_res = {}
for spec_name, spec_months in spec_grouped_master:
    ks_outcome = stats.ks_2samp(general_m, spec_months)
    stat, pval = ks_outcome
    ks_res[spec_name] = (stat, pval)

As before, let's check the resulting p-values for this pairwise testing.

In [None]:
# One line per specialisation with the p-value of its K-S test.
for spec_name, (_, p_value) in ks_res.items():
    print(spec_name, "has p-value \t", p_value)

We check which specialisations have a p-value over our defined threshold when compared to the general distribution, i.e. those where we fail to reject $H_0$.

In [None]:
# Specialisations whose K-S p-value is at or above the threshold (H0 not rejected).
filtered_ks_res = {}
for spec_name, result in ks_res.items():
    if result[1] >= P_THRESHOLD:
        filtered_ks_res[spec_name] = result

print("The following specialisations have p-value over", P_THRESHOLD)
print("====================================================")
for spec_name in filtered_ks_res:
    print(spec_name)

And then we check which specialisations have a p-value below our defined threshold when compared to the general distribution, i.e. those for which we reject $H_0$.

In [None]:
# Specialisations whose K-S p-value is below the threshold (H0 rejected).
filtered_ks_res = {}
for spec_name, result in ks_res.items():
    if result[1] < P_THRESHOLD:
        filtered_ks_res[spec_name] = result

print("The following specialisations have p-value below", P_THRESHOLD)
print("====================================================")
for spec_name in filtered_ks_res:
    print(spec_name)

There are three specialisations which get different results for the two tests: *Foundations of Software*, *Computer Engineering - SP* and *Biocomputing*.

# TODO: investigate why the two tests disagree for these specialisations