# Introduction

We analyze data about access to education for people in Europe. 
We also load the ISO country codes data as auxiliary data.  
Using Sankey diagrams allows us to show, in a single graph, the distribution of access to education levels per type of disability, age, sex and country.

# Analysis preparation

## Load packages

In [None]:
import pandas as pd
import os

## Load data

In [None]:
# Read the population-by-education-level CSV from the Kaggle input directory.
data_df = pd.read_csv("/kaggle/input/population-by-education-level-in-europe/population_by_education_level.csv")

# Data exploration

## Glimpse the data

In [None]:
# Summarize the columns, their dtypes and non-null counts.
data_df.info()

In [None]:
# Preview the first rows of the raw table.
data_df.head()

## Sankey diagram


### Visualization function using Sankey diagram

In [None]:
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

def genSankey(df, cat_cols=None, value_cols='', title='Sankey Diagram', param=None):
    """Build a Plotly Sankey figure dictionary from a long-format DataFrame.

    Every column in ``cat_cols`` is one level of the diagram. For each pair of
    consecutive columns, the values of ``value_cols`` are summed per
    (source, target) combination and become the link widths. A label that
    occurs in more than one column is represented by a single node, coloured
    after the first column in which it appears.

    Args:
        df: DataFrame containing the categorical columns and the value column.
        cat_cols: Ordered sequence of at least two categorical column names.
        value_cols: Name of the numeric column that is summed for the links.
        title: Title placed in the figure layout.
        param: Optional dict of layout settings; only ``"height"`` is read
            (1000 when ``param`` is omitted or has no ``"height"`` key).

    Returns:
        A dict with ``data`` (a single ``sankey`` trace) and ``layout`` keys,
        in the form this notebook passes to ``iplot(fig, validate=False)``.

    Raises:
        ValueError: If fewer than two columns are given in ``cat_cols``.
    """
    # None defaults avoid sharing one mutable list/dict between calls.
    cat_cols = [] if cat_cols is None else list(cat_cols)
    if len(cat_cols) < 2:
        raise ValueError(
            "genSankey needs at least two columns in cat_cols to build links"
        )
    height = 1000 if param is None else param.get("height", 1000)

    # One colour per level; the palette is cycled when there are more levels
    # than colours instead of failing with an IndexError.
    colorPalette = ['#4B8BBE', '#AF2346', '#32CD32', '#8B008B', '#FFD43B', '#646464']

    # Labels and colours are built together so they always have the same
    # length, even when a label is shared by several columns. The dict keeps
    # first-appearance order, which makes the node order reproducible and
    # gives an O(1) label -> node index lookup.
    labelIndex = {}
    colorList = []
    for idx, catCol in enumerate(cat_cols):
        levelColor = colorPalette[idx % len(colorPalette)]
        # Missing values are skipped: groupby drops them below, so they could
        # never be the endpoint of a link anyway.
        for label in df[catCol].dropna().unique().tolist():
            if label not in labelIndex:
                labelIndex[label] = len(labelIndex)
                colorList.append(levelColor)
    labelList = list(labelIndex)

    # Stack the (source, target, count) frames of all consecutive column
    # pairs, then aggregate once.
    pairFrames = []
    for sourceCol, targetCol in zip(cat_cols[:-1], cat_cols[1:]):
        pairDf = df[[sourceCol, targetCol, value_cols]].copy()
        pairDf.columns = ['source', 'target', 'count']
        pairFrames.append(pairDf)
    sourceTargetDf = (
        pd.concat(pairFrames)
        .groupby(['source', 'target'])
        .agg({'count': 'sum'})
        .reset_index()
    )

    # Translate labels into node indices for the sankey trace.
    sourceTargetDf['sourceID'] = [labelIndex[x] for x in sourceTargetDf['source']]
    sourceTargetDf['targetID'] = [labelIndex[x] for x in sourceTargetDf['target']]

    # Sankey trace: nodes (labels + colours) and links (index pairs + values).
    data = dict(
        type='sankey',
        node=dict(
            pad=15,
            thickness=20,
            line=dict(
                color="black",
                width=0.25,
            ),
            label=labelList,
            color=colorList,
        ),
        link=dict(
            source=sourceTargetDf['sourceID'],
            target=sourceTargetDf['targetID'],
            value=sourceTargetDf['count'],
        ),
    )

    layout = dict(
        title=title,
        font=dict(
            size=10,
        ),
        height=height,
    )

    fig = dict(data=[data], layout=layout)
    return fig

In [None]:
# Preview the first rows again before aggregating.
data_df.head()

In [None]:
# Sum `value` for every (isced11, age, geography) combination and order the
# result from the largest total to the smallest.
agg_df = (
    data_df.groupby(['isced11', 'age', 'geography'])['value']
    .sum()
    .reset_index()
    .sort_values("value", ascending=False)
)
print(f"All combinations: {agg_df.shape[0]}")
agg_df.head(5)

Let's remove the TOTAL rows as well as the aggregated EU country groups.

In [None]:
# Keep only rows that are not a TOTAL education level, not a TOTAL age group
# and not one of the aggregated EU groups (single combined mask).
data_agg = agg_df.loc[
    (agg_df.isced11 != "TOTAL")
    & (agg_df.age != "TOTAL")
    & ~agg_df.geography.isin(["EU15", "EU28", "EA19", "EU27_2020"])
]

In [None]:
# Three-level diagram: age group -> education level -> country.
fig = genSankey(
    data_agg,
    cat_cols=['age', 'isced11', 'geography'],
    value_cols='value',
    title='Access to education of European people: {Age Group -> Education Level -> Country}',
)
iplot(fig, validate=False)

Let's also include time in the diagram.

In [None]:
# Same aggregation as before, with the `date` column added as a grouping key;
# totals are ordered from largest to smallest.
agg_df = (
    data_df.groupby(['isced11', 'age', 'date', 'geography'])['value']
    .sum()
    .reset_index()
    .sort_values("value", ascending=False)
)
# Drop the TOTAL rows and the aggregated EU groups with one combined mask.
data_agg = agg_df.loc[
    (agg_df.isced11 != "TOTAL")
    & (agg_df.age != "TOTAL")
    & ~agg_df.geography.isin(["EU15", "EU28", "EA19", "EU27_2020"])
]
# The count and the preview refer to the unfiltered aggregation.
print(f"All combinations: {agg_df.shape[0]}")
agg_df.head(5)

In [None]:
# Four-level diagram: country -> age group -> education level -> date.
fig = genSankey(
    data_agg,
    cat_cols=['geography', 'age', 'isced11', 'date'],
    value_cols='value',
    title='Access to education of European people: { Country -> Age Group -> Education Level -> Years }',
)
iplot(fig, validate=False)