# Introduction

We analyze data on access to education for disabled people. 
We also load the ISO country codes data as auxiliary data.  
Sankey diagrams allow us to show, in a single graph, the distribution of education level by type of disability, age, sex and country.

# Analysis preparation

## Load packages

In [None]:
import pandas as pd
import os

## Load data

In [None]:
# Education-access records for disabled people in Europe.
data_df = pd.read_csv(
    "/kaggle/input/access-to-education-of-disabled-people-in-europe/education_disbled_eu.csv"
)
# ISO country-code lookup table, used as auxiliary data.
country_codes_df = pd.read_csv(
    "/kaggle/input/iso-country-codes-global/wikipedia-iso-country-codes.csv"
)

# Data exploration

## Glimpse the data

In [None]:
# Print a summary of the education data: columns, dtypes and non-null counts.
data_df.info()

In [None]:
# Print a summary of the country-code table: columns, dtypes and non-null counts.
country_codes_df.info()

In [None]:
# Show the first rows (head() defaults to five) of the education data.
data_df.head()

In [None]:
# Show the first rows (head() defaults to five) of the country-code table.
country_codes_df.head()

The country codes used in the education dataset correspond to the Alpha-2 codes in the ISO country code data. We will merge the two datasets to also get the English short names of the countries.

## Merge education data and country data

In [None]:
# Keep only the name and code columns of the ISO table and give them short
# names; 'geo' is the Alpha-2 code.
cc_df = country_codes_df[
    ['English short name lower case', 'Alpha-2 code', 'Alpha-3 code']
].rename(
    columns={
        'English short name lower case': 'geo_name',
        'Alpha-2 code': 'geo',
        'Alpha-3 code': 'geo_3',
    }
)
# Left join keeps every row of data_df. No key is passed, so pandas joins on
# the columns the two frames share (presumably just 'geo' -- verify).
data_c_df = data_df.merge(cc_df, how='left')
# Matching row counts confirm the join did not duplicate any record.
print(data_df.shape, data_c_df.shape)
data_c_df.head()

## Sankey diagram


### Visualization function using Sankey diagram

In [None]:
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

def genSankey(df, cat_cols=None, value_cols='', title='Sankey Diagram', param=None):
    """Build a Plotly Sankey figure dictionary from a flat data frame.

    Each distinct value found in the ``cat_cols`` columns becomes a node, and
    each pair of consecutive columns contributes links whose weight is the sum
    of ``value_cols`` over the rows sharing that (source, target) pair.

    Args:
        df: Data frame holding the category columns and the value column.
        cat_cols: Ordered column names defining the levels of the diagram,
            left to right. At least two are required.
        value_cols: Name of the numeric column that is summed into link weights.
        title: Figure title.
        param: Optional layout settings; only ``"height"`` is read
            (defaults to 1000 when absent).

    Returns:
        A dict with ``data`` (a list holding one ``sankey`` trace dict) and
        ``layout``, suitable for ``plotly.offline.iplot(fig, validate=False)``.

    Raises:
        ValueError: If fewer than two category columns are given.

    Notes:
        Nodes are identified by label, so a value that occurs in several
        columns is drawn as a single node. It takes the color of the first
        column in which it occurs. Missing values (NaN) are not turned into
        nodes; ``groupby`` drops them from the links as well.
    """
    cat_cols = [] if cat_cols is None else list(cat_cols)
    param = {} if param is None else param
    if len(cat_cols) < 2:
        raise ValueError(
            "cat_cols must contain at least two columns to build source-target links"
        )

    # One color per category column; the palette is reused cyclically when
    # there are more columns than colors.
    color_palette = ['#4B8BBE', '#AF2346', '#32CD32', '#8B008B', '#FFD43B', '#646464']

    # Collect labels and their colors together, in order of first appearance.
    # Building both from the same mapping keeps label i and color i aligned
    # even when a label occurs in more than one column, and makes the node
    # order deterministic.
    label_colors = {}
    for level, cat_col in enumerate(cat_cols):
        level_color = color_palette[level % len(color_palette)]
        for label in df[cat_col].dropna().tolist():
            label_colors.setdefault(label, level_color)
    label_list = list(label_colors)
    color_list = list(label_colors.values())
    label_index = {label: position for position, label in enumerate(label_list)}

    # Stack the (source, target, count) triples of every pair of adjacent levels.
    pair_frames = []
    for source_col, target_col in zip(cat_cols, cat_cols[1:]):
        # Copy before renaming so the caller's frame is never touched.
        pair_df = df[[source_col, target_col, value_cols]].copy()
        pair_df.columns = ['source', 'target', 'count']
        pair_frames.append(pair_df)

    # Aggregate once: one link per distinct (source, target) pair.
    source_target_df = (
        pd.concat(pair_frames)
        .groupby(['source', 'target'])
        .agg({'count': 'sum'})
        .reset_index()
    )

    # Translate labels into node positions, as the sankey trace expects.
    source_target_df['sourceID'] = source_target_df['source'].map(label_index)
    source_target_df['targetID'] = source_target_df['target'].map(label_index)

    # Sankey trace definition.
    data = dict(
        type='sankey',
        node=dict(
            pad=15,
            thickness=20,
            line=dict(
                color="black",
                width=0.25,
            ),
            label=label_list,
            color=color_list,
        ),
        link=dict(
            source=source_target_df['sourceID'],
            target=source_target_df['targetID'],
            value=source_target_df['count'],
        ),
    )

    layout = dict(
        title=title,
        font=dict(
            size=10,
        ),
        height=param.get("height", 1000),
    )

    fig = dict(data=[data], layout=layout)
    return fig

### Aggregate the values for the Sankey diagrams

In [None]:
# Sum 'value' for every combination of education level, health problem,
# sex, age group and country, then order the combinations largest first.
agg_df = (
    data_c_df
    .groupby(['isced97', 'hlth_pb', 'sex', 'age', 'geo'])['value']
    .sum()
    .reset_index()
    .sort_values(by=["value"], ascending=False)
)
print(f"All combinations: {agg_df.shape[0]}")
agg_df.head(10)

### Sankey diagrams by health problem, sex, age, education level and country

Let's remove TOTAL (for isced97, hlth_pb and age) and T (for sex).

In [None]:
# Keep only the detailed rows: drop every row where any dimension holds its
# aggregate marker ("TOTAL" for isced97, hlth_pb and age; "T" for sex).
is_detail_row = (
    (agg_df.isced97 != "TOTAL")
    & (agg_df.hlth_pb != "TOTAL")
    & (agg_df.sex != "T")
    & (agg_df.age != "TOTAL")
)
data_agg = agg_df.loc[is_detail_row]

In [None]:
# Five-level flow: health problem, sex, age group, education level, country.
fig = genSankey(
    data_agg,
    cat_cols=['hlth_pb', 'sex', 'age', 'isced97', 'geo'],
    value_cols='value',
    title='Access to education of disabled people: {Health Problem  -> Sex -> Age Group -> Education Level -> Country}',
)
# genSankey returns a plain dict, so it is rendered with validation turned off.
iplot(fig, validate=False)

In [None]:
# Three-level flow: health problem, age group, education level.
fig = genSankey(
    data_agg,
    cat_cols=['hlth_pb', 'age', 'isced97'],
    value_cols='value',
    title='Access to education of disabled people: {Health Problem  -> Age Group -> Education Level}',
)
# genSankey returns a plain dict, so it is rendered with validation turned off.
iplot(fig, validate=False)

In [None]:
# Three-level flow: health problem, sex, education level.
fig = genSankey(
    data_agg,
    cat_cols=['hlth_pb', 'sex', 'isced97'],
    value_cols='value',
    title='Access to education of disabled people: {Health Problem  -> Sex -> Education Level}',
)
# genSankey returns a plain dict, so it is rendered with validation turned off.
iplot(fig, validate=False)

In [None]:
# Five-level flow: health problem, country, sex, age group, education level.
fig = genSankey(
    data_agg,
    cat_cols=['hlth_pb', 'geo', 'sex', 'age', 'isced97'],
    value_cols='value',
    title='Access to education of disabled people: {Health Problem  -> Country -> Sex -> Age Group -> Education Level}',
)
# genSankey returns a plain dict, so it is rendered with validation turned off.
iplot(fig, validate=False)