In [26]:
import os
import json
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import zscore

import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import dash_bootstrap_components as dbc

In [27]:
# Read the voivodeship GeoJSON and tabulate the `properties` of every feature,
# with the `nazwa` (name) column upper-cased.
with open('./geo/wojewodztwa-min.geojson', 'r', encoding="utf8") as json_file:
    geojson = json.loads(json_file.read())
v_id = pd.DataFrame([feature['properties'] for feature in geojson['features']])
v_id = v_id.assign(nazwa=v_id['nazwa'].str.upper())

In [28]:
# Load the cleaned CEIDG records; the main-address TERC code is read as text.
df = pd.read_csv(
    './data/ceidg_data_classif_cleaned.csv',
    dtype={'MainAddressTERC': str},
)

In [29]:
# Peek at four randomly chosen rows of the loaded frame.
df.sample(n=4)

Unnamed: 0.1,Unnamed: 0,RandomDate,MonthOfStartingOfTheBusiness,QuarterOfStartingOfTheBusiness,MainAddressVoivodeship,MainAddressCounty,MainAddressTERC,CorrespondenceAddressVoivodeship,CorrespondenceAddressCounty,CorrespondenceAddressTERC,...,ShareholderInOtherCompanies,PKDMainSection,PKDMainDivision,PKDMainGroup,PKDMainClass,NoOfUniquePKDSections,NoOfUniquePKDDivsions,NoOfUniquePKDGroups,NoOfUniquePKDClasses,Target
186581,186581,2018-10-13,May,2,MAZOWIECKIE,GRODZISKI,1405,MAZOWIECKIE,GRODZISKI,1405045.0,...,False,G,47.0,478.0,4789.0,1,2,2,2,False
2253705,2253705,2018-08-17,January,1,KUJAWSKO-POMORSKIE,BYDGOSKI,403,KUJAWSKO-POMORSKIE,BYDGOSKI,403052.0,...,False,Q,86.0,862.0,8622.0,1,1,1,1,False
679737,679737,2018-06-29,February,1,ŚLĄSKIE,BYTOM,2462,ŚLĄSKIE,BYTOM,2462011.0,...,False,J,62.0,620.0,6201.0,3,4,5,5,True
258141,258141,2017-11-11,January,1,PODLASKIE,SIEMIATYCKI,2010,PODLASKIE,SIEMIATYCKI,2010042.0,...,False,A,2.0,24.0,240.0,1,1,1,1,False


In [30]:
# Count businesses per voivodeship (rows) and PKD main section (columns).
data = df[['MainAddressVoivodeship', 'MainAddressCounty', 'PKDMainSection']]
matrix = data.groupby(['MainAddressVoivodeship', 'PKDMainSection']).size().unstack(fill_value=0)

# Share of each section within a voivodeship, then z-scored column by column,
# so 'Max' is the section whose share is most above the cross-voivodeship mean.
matrix_proportions = matrix.div(matrix.sum(axis=1), axis=0)
normalized = matrix_proportions.apply(zscore)
normalized['Max'] = normalized.idxmax(axis=1)
print(normalized['Max'])

# Same ranking, but on the raw counts instead of the within-voivodeship shares.
normalized_absolute = matrix.apply(zscore)
# Section T has very few businesses, so any non-zero count sends its z-score
# through the roof; it is excluded from the ranking. It is dropped by label
# (not by position) so the exclusion still hits T if the column order changes,
# and is a no-op when T is absent from the data.
normalized_absolute['Max'] = (
    normalized_absolute.drop(columns=['T'], errors='ignore').idxmax(axis=1)
)
print(normalized_absolute['Max'])

MainAddressVoivodeship
DOLNOŚLĄSKIE           K
KUJAWSKO-POMORSKIE     D
LUBELSKIE              B
LUBUSKIE               S
MAZOWIECKIE            M
MAŁOPOLSKIE            R
OPOLSKIE               K
PODKARPACKIE           B
PODLASKIE              A
POMORSKIE              O
WARMIŃSKO-MAZURSKIE    Q
WIELKOPOLSKIE          N
ZACHODNIOPOMORSKIE     I
ŁÓDZKIE                G
ŚLĄSKIE                K
ŚWIĘTOKRZYSKIE         E
Name: Max, dtype: object
MainAddressVoivodeship
DOLNOŚLĄSKIE           F
KUJAWSKO-POMORSKIE     D
LUBELSKIE              B
LUBUSKIE               O
MAZOWIECKIE            J
MAŁOPOLSKIE            F
OPOLSKIE               O
PODKARPACKIE           B
PODLASKIE              A
POMORSKIE              O
WARMIŃSKO-MAZURSKIE    A
WIELKOPOLSKIE          A
ZACHODNIOPOMORSKIE     I
ŁÓDZKIE                B
ŚLĄSKIE                C
ŚWIĘTOKRZYSKIE         E
Name: Max, dtype: object


In [31]:
# --- PKD classification lookup tables -------------------------------------
# Each table gets a display label "<symbol>-<name>" in `name` and is sorted by
# `symbol`. For divisions, groups and classes the symbol is converted to
# float64 after the label is built, so the label keeps the original text form.

sections = pd.read_csv('./data/section_list.csv', dtype=str)
sections['name'] = sections[['symbol', 'name']].apply('-'.join, axis=1)
sections = sections.sort_values(axis=0, by='symbol')

divisions = pd.read_csv('./data/division_list.csv', dtype=str)
divisions['name'] = divisions[['symbol', 'name']].apply('-'.join, axis=1)
divisions['symbol'] = divisions['symbol'].astype('float64')
divisions = divisions.sort_values(axis=0, by='symbol')

groups = pd.read_csv('./data/group_list.csv')
groups['symbol'] = groups['symbol'].astype(str)
groups['name'] = groups[['symbol', 'name']].apply('-'.join, axis=1)
groups['symbol'] = groups['symbol'].astype('float64')
groups = groups.sort_values(axis=0, by='symbol')

classes = pd.read_csv('./data/class_list.csv')
classes['symbol'] = classes['symbol'].astype(str)
classes['name'] = classes[['symbol', 'name']].apply('-'.join, axis=1)
classes['symbol'] = classes['symbol'].astype('float64')
classes = classes.sort_values(axis=0, by='symbol')

# --- Population and TERC lookup tables ------------------------------------
# The dtype mapping must name the column exactly as it appears in the file
# ('TOTAL', the column selected below); the previous 'Total' key matched no
# column, so the float64 conversion was never applied.
pop = pd.read_csv(
    './data/Population_list.csv',
    dtype={'CODE': str, 'NAME': str, 'TOTAL': 'float64'},
)[['CODE', 'NAME', 'TOTAL']]

terc_list = pd.read_csv('./data/TERC_list.csv', dtype=str)

Empty DataFrame
Columns: [WOJ, POW, NAZWA, CODE]
Index: []


In [60]:
# Region boundaries for both map granularities.
with open('./geo/wojewodztwa-min.geojson', 'r', encoding="utf8") as json_file:
    geojson_voivodeships = json.load(json_file)

with open('./geo/powiaty-min.geojson', 'r', encoding="utf8") as json_file:
    geojson_counties = json.load(json_file)

# Work on an explicit copy so the derived columns below are added to `data`
# itself rather than to a slice of `df` (avoids SettingWithCopyWarning).
data = df[['MainAddressVoivodeship', 'MainAddressCounty', 'MainAddressTERC', 'Sex', 'PKDMainSection', 'PKDMainDivision',
          'PKDMainGroup', 'PKDMainClass']].copy()
# The first 2 / 4 characters of the TERC code are used as the voivodeship /
# county code.
# NOTE(review): the `df.sample` output earlier in this notebook shows a
# KUJAWSKO-POMORSKIE row whose MainAddressTERC is displayed as 403; if the CSV
# stores codes without their leading zero, these slices yield '40' instead of
# '04' - confirm, and zero-pad the code first if so.
data['MainVoivodeshipTERC'] = data['MainAddressTERC'].str.slice(start=0, stop=2)
data['MainCountyTERC'] = data['MainAddressTERC'].str.slice(start=0, stop=4)
# `drop` returns a new frame; the result must be assigned for the column to
# actually be removed (the original call discarded it).
data = data.drop(columns=['MainAddressTERC'])

# Every distinct, fully specified section/division/group/class combination in the data.
possible_classification_combinations = data[['PKDMainSection', 'PKDMainDivision', 'PKDMainGroup', 'PKDMainClass']].drop_duplicates().dropna()

# Dropdown options for the PKD sections: display label -> section symbol.
section_list = [
    {'label': label, 'value': symbol}
    for label, symbol in zip(sections['name'], sections['symbol'])
]

# The Dash application, styled with the Bootstrap theme.
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])


def _form_group(*children):
    """Wrap the given components, in order, in a single ``dbc.FormGroup``."""
    return dbc.FormGroup(list(children))


# Sidebar card: map granularity, the four cascading PKD dropdowns, and the
# total / per-1000-inhabitants switch.
controls = dbc.Card(
    [
        _form_group(
            dbc.Label("Podział"),
            dcc.Dropdown(
                id="Podział",
                value='voivodeships',
                options=[
                    {'label': 'Województwa', 'value': 'voivodeships'},
                    {'label': 'Powiaty', 'value': 'counties'},
                ],
            ),
        ),
        _form_group(
            dbc.Label("Sekcja"),
            dcc.Dropdown(id="section-dropdown", options=section_list),
        ),
        _form_group(
            dbc.Label("Dział"),
            dcc.Dropdown(id="division-dropdown"),
        ),
        _form_group(
            dbc.Label("Grupa"),
            dcc.Dropdown(id="group-dropdown"),
        ),
        _form_group(
            dbc.Label("Klasa"),
            dcc.Dropdown(id="class-dropdown"),
        ),
        _form_group(
            dcc.RadioItems(
                id='radio',
                value='total',
                options=[
                    {'label': 'Całkowita liczba działalności', 'value': 'total'},
                    {'label': 'Liczba przypadająca na 1000 mieszkańców', 'value': 'per_capita'},
                ],
            ),
        ),
    ],
    body=True,
)


# Left column: the controls card with the pie chart (and its label) below it.
_sidebar_column = dbc.Col(
    [
        controls,
        dbc.Label(
            "Liczba działalności z podziałem na płeć",
            id="chart-label",
            style={"padding-top" : "2%", "text-align" : "center"},
        ),
        dcc.Graph(
            id="chart",
            figure={'data' : [], 'layout' : {'margin': {'b': 0, 'l': 0, 'r': 0, 't': 0}}},
        ),
    ],
    md=3,
)

# Right column: the choropleth map.
_map_column = dbc.Col(dcc.Graph(id="graph"), md=9)

# Page layout: heading, separator, then the two columns side by side.
app.layout = dbc.Container(
    [
        html.H2("Charakterystyka przestrzenna działalności gospodarczej"),
        html.Hr(style={"border" : ""}),
        dbc.Row([_sidebar_column, _map_column], align="left"),
    ],
    style={"max-width" : "90%", "margin-top" : "2%"},
)


def _count_businesses(section, division, group, _class, name_col, terc_col):
    """Count businesses per area for the most specific PKD level selected.

    Only the deepest non-empty selection is used as the filter (class, else
    group, else division, else section); with nothing selected every row of
    ``data`` is counted. Returns a frame with columns ``name_col``,
    ``terc_col`` and ``size``.
    """
    if section is None:
        selected = data
    elif division is None:
        selected = data[data['PKDMainSection'] == section]
    elif group is None:
        selected = data[data['PKDMainDivision'] == division]
    elif _class is None:
        selected = data[data['PKDMainGroup'] == group]
    else:
        selected = data[data['PKDMainClass'] == _class]
    return selected.groupby([name_col, terc_col]).size().to_frame('size').reset_index()


@app.callback(Output("graph", "figure"), [
        Input("section-dropdown", "value"),
        Input("division-dropdown", "value"),
        Input("group-dropdown", "value"),
        Input("class-dropdown", "value"),
        Input("Podział", "value"),
        Input("radio", "value")
    ])
def make_figure(section, division, group, _class, area_division, radio):
    """Build the choropleth of business counts for the current selection.

    Parameters
    ----------
    section, division, group, _class :
        Values of the four PKD dropdowns; ``None`` when nothing is selected.
    area_division :
        ``"counties"`` for the county map; ``None`` or ``"voivodeships"`` for
        the voivodeship map.
    radio :
        ``'total'`` for raw counts, ``'per_capita'`` for counts per 1000
        inhabitants (using ``pop['TOTAL']``).

    Returns
    -------
    The plotly figure for the ``graph`` component.
    """
    # Everything that differs between the two map granularities.
    if area_division is None or area_division == "voivodeships":
        geojson = geojson_voivodeships
        name_col, terc_col = 'MainAddressVoivodeship', 'MainVoivodeshipTERC'
        name_offset = 0
    else:
        geojson = geojson_counties
        name_col, terc_col = 'MainAddressCounty', 'MainCountyTERC'
        # The first 7 characters of each county `nazwa` are cut off before
        # matching (presumably a "powiat " prefix - TODO confirm).
        name_offset = 7

    v_size = _count_businesses(section, division, group, _class, name_col, terc_col)
    if radio == 'per_capita':
        # `join` keeps v_size's index, so the division aligns row by row.
        population = v_size.join(pop.set_index('CODE'), on=terc_col)['TOTAL']
        v_size['size'] = 1e3 * v_size['size'] / population

    v_id = pd.DataFrame([feature['properties'] for feature in geojson['features']])
    v_id['nazwa'] = v_id['nazwa'].str[name_offset:].str.upper()

    # Right merge: every map feature gets a row, even when no business matched.
    # NOTE(review): matching is by upper-cased name only; if several areas
    # share a name, each feature is paired with every same-named row - confirm
    # whether the features' `id` can be matched against the TERC code instead.
    map_data = pd.merge(v_size, v_id.set_index('nazwa'), how='right', left_on=name_col, right_index=True)
    # Assign the result instead of calling fillna(inplace=True) on a column
    # selection, which is deprecated chained mutation and has no effect under
    # pandas Copy-on-Write.
    map_data['size'] = map_data['size'].fillna(0)

    tt = 'liczba firm' if radio == 'total' else 'liczba firm na 1000 mieszkańców'
    fig = px.choropleth(
        map_data,
        geojson=geojson,
        color="size",
        locations="id",
        featureidkey="properties.id",
        projection="mercator",
        color_continuous_scale="peach",
        labels={'size': tt},
        hover_name=name_col,
        # The TERC code travels as customdata[0]; the pie-chart callback reads it on click.
        hover_data=[terc_col],
        height=800,
    )
    fig.update_geos(fitbounds="locations", visible=False, lataxis_range=[50,60], lonaxis_range=[5, 30])
    return fig

@app.callback(Output('chart', 'figure'), [
        Input('graph', 'clickData'),
        Input("Podział", "value"),
        Input("section-dropdown", "value"),
        Input("division-dropdown", "value"),
        Input("group-dropdown", "value"),
        Input("class-dropdown", "value")
        ])
def printData(clickData, area_division, section, division, group, _class):
    """Build the pie chart of business counts by sex for the current selection.

    Parameters
    ----------
    clickData :
        Click payload of the map, or ``None`` when no area is selected; the
        clicked area's TERC code is read from ``points[0].customdata[0]``.
    area_division :
        ``'counties'`` to interpret the code as a county code; anything else
        is treated as the voivodeship level.
    section, division, group, _class :
        Values of the four PKD dropdowns; filters are applied cumulatively
        from section downwards and stop at the first ``None``.

    Returns
    -------
    The plotly figure for the ``chart`` component, titled with the selected
    area (``"POLSKA"`` when nothing is clicked).
    """
    # Cumulative PKD filter: section & division & group & class, cut at the
    # first level that has no selection.
    mask = pd.Series(True, index=data.index)
    levels = (
        ('PKDMainSection', section),
        ('PKDMainDivision', division),
        ('PKDMainGroup', group),
        ('PKDMainClass', _class),
    )
    for column, chosen in levels:
        if chosen is None:
            break
        mask &= data[column] == chosen
    chart_data = data if section is None else data[mask]

    title = "POLSKA"
    if clickData is not None:
        terc = clickData['points'][0]['customdata'][0]
        if area_division == 'counties':
            terc_col, prefix = 'MainCountyTERC', "POW. "
        else:
            terc_col, prefix = 'MainVoivodeshipTERC', "WOJ. "
        chart_data = chart_data[chart_data[terc_col] == terc]

        # The clicked code may be missing from terc_list (e.g. an area with no
        # matching rows carries no TERC code, or a click left over from the
        # other map granularity); fall back instead of raising IndexError.
        matches = terc_list.loc[terc_list['CODE'] == terc, 'NAZWA']
        if not matches.empty:
            name = matches.iloc[0]
        else:
            name = terc if isinstance(terc, str) else "brak danych"
        title = prefix + name

    chart_data = chart_data.groupby('Sex').size().to_frame('count').reset_index()

    fig = px.pie(chart_data, values='count', names='Sex', color='Sex', color_discrete_map={'M':'blue', 'F':'pink'}, title=title)
    return fig

@app.callback(Output("division-dropdown", "options"), [Input("section-dropdown", "value")])
def get_division_options(section):
    """Return the division dropdown options whose parent is the selected section."""
    matching = divisions.loc[divisions['parent'] == section]
    labels = matching['name'].tolist()
    symbols = matching['symbol'].tolist()
    return [{'label': label, 'value': symbol} for label, symbol in zip(labels, symbols)]

@app.callback(Output("division-dropdown", "value"), [Input("section-dropdown", "value")])
def reset_division_value_on_section_change(section):
    """Clear the division selection whenever the section value changes."""
    cleared = None
    return cleared

@app.callback(Output("group-dropdown", "options"), [Input("division-dropdown", "value")])
def get_group_options(division):
    """Return the group dropdown options whose parent is the selected division."""
    matching = groups.loc[groups['parent'] == division]
    labels = matching['name'].tolist()
    symbols = matching['symbol'].tolist()
    return [{'label': label, 'value': symbol} for label, symbol in zip(labels, symbols)]

@app.callback(Output("group-dropdown", "value"), [Input("section-dropdown", "value"), Input("division-dropdown", "value")])
def reset_group_value_on_section_or_division_change(section, division):
    """Clear the group selection whenever the section or division value changes."""
    cleared = None
    return cleared

@app.callback(Output("class-dropdown", "options"), [Input("group-dropdown", "value")])
def get_class_options(group):
    """Return the class dropdown options whose parent is the selected group.

    This callback was previously also named ``get_division_options``, which
    shadowed the division callback defined earlier in the file; it is renamed
    for what it actually produces. Dash registers the function through the
    decorator, so the rename does not affect how it is invoked.

    Parameters
    ----------
    group :
        Value of the group dropdown, or ``None`` when nothing is selected (in
        which case no row matches and an empty list is returned).

    Returns
    -------
    list of dict
        One ``{'label': ..., 'value': ...}`` entry per matching row of
        ``classes``, using its ``name`` and ``symbol`` columns.
    """
    classes_from_group = classes[classes['parent'] == group]
    return [
        {'label': label, 'value': symbol}
        for label, symbol in zip(classes_from_group['name'].tolist(), classes_from_group['symbol'].tolist())
    ]

@app.callback(
    Output("class-dropdown", "value"),
    [
        Input("section-dropdown", "value"),
        Input("division-dropdown", "value"),
        Input("group-dropdown", "value"),
    ],
)
def reset_class_value_on_section_or_division_or_group_change(section, division, group):
    """Clear the class selection whenever the section, division or group value changes."""
    cleared = None
    return cleared

@app.callback(Output('graph', 'clickData'), [Input('Podział', 'value')])
def reset_clickData_on_map_change(area_division):
    """Discard the map's click selection whenever the map granularity changes."""
    cleared = None
    return cleared

# Start the app's built-in server with default settings (the captured log
# below shows it listening on http://127.0.0.1:8050/ with debug mode off).
app.run_server()

* Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off
 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [17/May/2020 20:16:18] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [17/May/2020 20:16:18] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [17/May/2020 20:16:18] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [17/May/2020 20:16:18] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [17/May/2020 20:16:18] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [17/May/2020 20:16:18] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [17/May/2020 20:16:18] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [17/May/2020 20:16:18] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [17/May/2020 20:16:19] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [17/