In [1]:
import pandas as pd
import logging

from tekkieworden.config import config
from tekkieworden.processing.munge import *

# Notebook display configuration: let pandas render up to 100 columns
# before it starts eliding the middle of wide frames.
pd.options.display.max_columns = 100

### MBO

In [2]:
# Column mapping for the graduates ("gediplomeerden") file: the normalised
# "dip<group><year>" headers become "<year>_<group>_d", so they stay
# distinguishable from the inscription columns (suffix "_i") after a join.
mbo_rename_d_dict = {
    "dipman2015": "2015_man_d",
    "dipvrw2015": "2015_vrouw_d",
    "diptotaal2015": "2015_tot_d",
    "dipman2016": "2016_man_d",
    "dipvrw2016": "2016_vrouw_d",
    "diptotaal2016": "2016_tot_d",
    "dipman2017": "2017_man_d",
    "dipvrw2017": "2017_vrouw_d",
    "diptotaal2017": "2017_tot_d",
    "dipman2018": "2018_man_d",
    "dipvrw2018": "2018_vrouw_d",
    # NOTE(review): this key looks misspelled, but the recorded output further
    # down shows a `2018_tot_d` column, which suggests the raw file header
    # really is spelled this way -- confirm against the CSV before "fixing" it.
    "diptotaa2018l": "2018_tot_d",
    "dipman2019": "2019_man_d",
    "dipvrw2019": "2019_vrouw_d",
    "diptotaal2019": "2019_tot_d",
}

# Read the semicolon-separated file, normalise the headers (lower case,
# spaces -> underscores) and then apply the mapping above.
mbo_d = (
    pd.read_csv(str(config.PATH_TO_RAW_DATA) + '/mbo_gediplomeerden_2019.csv', sep=';')
    .rename(columns=lambda header: header.lower().replace(" ", "_"))
    .rename(columns=mbo_rename_d_dict)
)

In [3]:
# Column mapping for the inscriptions file: the normalised "<group><year>"
# headers become "<year>_<group>_i", so they stay distinguishable from the
# graduates columns (suffix "_d") after a join.
mbo_rename_i_dict = {
    "man2015": "2015_man_i",
    "vrouw2015": "2015_vrouw_i",
    "totaal2015": "2015_tot_i",
    "man2016": "2016_man_i",
    "vrouw2016": "2016_vrouw_i",
    "totaal2016": "2016_tot_i",
    "man2017": "2017_man_i",
    "vrouw2017": "2017_vrouw_i",
    "totaal2017": "2017_tot_i",
    "man2018": "2018_man_i",
    "vrouw2018": "2018_vrouw_i",
    "totaal2018": "2018_tot_i",
    "man2019": "2019_man_i",
    "vrouw2019": "2019_vrouw_i",
    "totaal2019": "2019_tot_i",
}

# Read the semicolon-separated file, normalise the headers (lower case,
# spaces -> underscores) and then apply the mapping above.
mbo = (
    pd.read_csv(str(config.PATH_TO_RAW_DATA) + '/mbo_inscriptions_2019.csv', sep=';')
    .rename(columns=lambda header: header.lower().replace(" ", "_"))
    .rename(columns=mbo_rename_i_dict)
)

In [4]:
def unstack_duo_mbo_files(input_df):
    """Sum the year columns of a DUO MBO frame per institution and qualification.

    Despite the name, nothing is unstacked: the frame is grouped on
    ``brin_nummer``, ``kwalificatie_code`` and ``kwalificatie_naam`` and every
    column whose name contains a digit is summed within each group.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame holding the three grouping columns plus the columns to sum
        (any column with a digit in its name).

    Returns
    -------
    pandas.DataFrame
        One row per group, indexed by the three grouping columns, with the
        summed digit-named columns.

    Raises
    ------
    KeyError
        If one of the grouping columns is missing from ``input_df``.

    Notes
    -----
    NOTE(review): ``groupby`` is called with its defaults; in current pandas
    that drops rows whose grouping key is missing -- confirm this is intended.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence and
    # triggers a Deprecation/SyntaxWarning on recent Python versions.
    agg_cols = input_df.filter(regex=r'\d', axis=1).columns.tolist()
    groupby_cols = ['brin_nummer', 'kwalificatie_code', 'kwalificatie_naam']
    agg = input_df.groupby(groupby_cols)[agg_cols].sum()
    logging.info(f"dataframe shape: {agg.shape}")
    return agg

In [5]:
# Aggregate inscriptions first, then graduates, so both frames share the
# same (brin_nummer, kwalificatie_code, kwalificatie_naam) index.
mbo_i_agg = unstack_duo_mbo_files(mbo)
mbo_d_agg = unstack_duo_mbo_files(mbo_d)

INFO:root:dataframe shape: (14029, 15)
INFO:root:dataframe shape: (12557, 15)


In [6]:
from tekkieworden.processing.utilities import pandas_join_on_index

In [7]:
# NOTE(review): this definition shadows the `pandas_join_on_index` imported
# from tekkieworden.processing.utilities in the preceding cell -- keep only one.
def pandas_join_on_index(left_df, right_df, how="inner"):
    """Join two frames on their indices and log diagnostics about the result.

    Parameters
    ----------
    left_df : pandas.DataFrame
        Left-hand frame; its index is used as the join key.
    right_df : pandas.DataFrame
        Right-hand frame; its index is used as the join key.
    how : str, default "inner"
        Join type, passed straight through to ``pandas.merge``.

    Returns
    -------
    pandas.DataFrame
        The merged frame, including the ``_merge`` indicator column that
        ``pandas.merge(..., indicator=True)`` adds.
    """
    logging.info(f"left_df shape: {left_df.shape}")
    logging.info(f"right_df shape: {right_df.shape}")

    logging.info(f"joining on left_indices: {list(left_df.index.names)}")
    # This line reports the RIGHT frame's index names; it was previously
    # mislabelled as "left_key".
    logging.info(f"joining on right_indices: {list(right_df.index.names)}")

    joined_df = pd.merge(
        left_df, right_df, how=how, left_index=True, right_index=True, indicator=True
    )

    # Per-category row counts (both / left_only / right_only) of the join.
    merge_counts = joined_df["_merge"].value_counts()
    logging.info(f"Join result\n\t{merge_counts}")
    logging.info(f"joined df: {joined_df.shape}")

    return joined_df

In [8]:
# Left join: keep every inscription row, attaching graduate counts where the
# same (institution, qualification) key exists in the graduates aggregate.
df = pandas_join_on_index(mbo_i_agg, mbo_d_agg, how="left")

INFO:root:left_df shape: (14029, 15)
INFO:root:right_df shape: (12557, 15)
INFO:root:joining on left_indices: ['brin_nummer', 'kwalificatie_code', 'kwalificatie_naam']
INFO:root:joining on left_key: ['brin_nummer', 'kwalificatie_code', 'kwalificatie_naam']
INFO:root:Join result
	both          11517
left_only      2512
right_only        0
Name: _merge, dtype: int64
INFO:root:joined df: (14029, 31)


In [9]:
# Turn the join-key index levels back into ordinary columns.
# NOTE: rebinding `df` makes this cell non-idempotent -- run it once per kernel.
df = df.reset_index(drop=False)

In [11]:
# Peek at the first two joined rows.
df.head(n=2)

Unnamed: 0,brin_nummer,kwalificatie_code,kwalificatie_naam,2015_man_i,2015_vrouw_i,2015_tot_i,2016_man_i,2016_vrouw_i,2016_tot_i,2017_man_i,2017_vrouw_i,2017_tot_i,2018_man_i,2018_vrouw_i,2018_tot_i,2019_man_i,2019_vrouw_i,2019_tot_i,2015_man_d,2015_vrouw_d,2015_tot_d,2016_man_d,2016_vrouw_d,2016_tot_d,2017_man_d,2017_vrouw_d,2017_tot_d,2018_man_d,2018_vrouw_d,2018_tot_d,2019_man_d,2019_vrouw_d,2019_tot_d,_merge
0,00GT,22090.0,Podium- en evenemententechniek,6.0,2.0,8.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,left_only
1,00GT,22142.0,Verkoper,67.0,19.0,86.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,left_only


In [13]:
# `label_tech_studies` is not defined or explicitly imported in this notebook;
# presumably it arrives via the star import of tekkieworden.processing.munge
# -- TODO confirm. Judging by the next cell it adds a `tech_label` column.
df = label_tech_studies(
    input_df=df,
    yaml_file='mbo_tech_labels.yml',
    label_col='kwalificatie_naam',
)

In [14]:
# Distribution of the assigned labels across all rows.
df["tech_label"].value_counts()

no_tech    13078
tech         702
growth       146
design       103
Name: tech_label, dtype: int64