In [1]:
import os
import json
import numpy as np
import pandas as pd
import plotly.express as px
from scipy.stats import zscore

import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import dash_bootstrap_components as dbc

In [3]:
# Read the voivodeship geojson and tabulate each feature's properties,
# upper-casing the 'nazwa' column.
with open('./geo/wojewodztwa-min.geojson', 'r', encoding="utf8") as geojson_handle:
    geojson = json.load(geojson_handle)
feature_properties = [feature['properties'] for feature in geojson['features']]
v_id = pd.DataFrame(feature_properties).assign(
    nazwa=lambda frame: frame['nazwa'].str.upper()
)

In [4]:
# Source table for every later cell in this notebook.
ceidg_csv_path = './data/ceidg_data_classif_cleaned.csv'
df = pd.read_csv(ceidg_csv_path)

In [6]:
# Preview a few random rows; the fixed seed keeps the preview identical
# across Restart & Run All.
df.sample(4, random_state=0)

Unnamed: 0.1,Unnamed: 0,RandomDate,MonthOfStartingOfTheBusiness,QuarterOfStartingOfTheBusiness,MainAddressVoivodeship,MainAddressCounty,MainAddressTERC,CorrespondenceAddressVoivodeship,CorrespondenceAddressCounty,CorrespondenceAddressTERC,...,ShareholderInOtherCompanies,PKDMainSection,PKDMainDivision,PKDMainGroup,PKDMainClass,NoOfUniquePKDSections,NoOfUniquePKDDivsions,NoOfUniquePKDGroups,NoOfUniquePKDClasses,Target
1997305,1997305,2018-09-18,April,2,DOLNOŚLĄSKIE,WAŁBRZYSKI,221055.0,DOLNOŚLĄSKIE,WAŁBRZYSKI,221055.0,...,False,I,55.0,552.0,5520.0,2,2,4,4,False
963492,963492,2018-05-10,February,1,MAŁOPOLSKIE,MYŚLENICKI,1209035.0,MAŁOPOLSKIE,MYŚLENICKI,1209035.0,...,False,Q,86.0,869.0,8690.0,1,1,1,1,True
726538,726538,2018-05-17,March,1,ŚLĄSKIE,KATOWICE,2469011.0,ŚLĄSKIE,KATOWICE,2469011.0,...,False,G,47.0,471.0,4711.0,1,1,1,1,True
578620,578620,2018-10-18,May,2,LUBUSKIE,GORZÓW WIELKOPOLSKI,861011.0,LUBUSKIE,GORZÓW WIELKOPOLSKI,861011.0,...,False,F,43.0,432.0,4322.0,2,2,2,4,False


In [7]:
# Keep only the voivodeship and the main PKD section of each business.
data = df.loc[:, ['MainAddressVoivodeship', 'PKDMainSection']]

In [8]:
# Number of businesses whose main PKD section is 'M', per voivodeship.
is_section_m = data['PKDMainSection'] == 'M'
data.loc[is_section_m].groupby('MainAddressVoivodeship').size().reset_index(name='size')

Unnamed: 0,MainAddressVoivodeship,size
0,DOLNOŚLĄSKIE,23281
1,KUJAWSKO-POMORSKIE,10904
2,LUBELSKIE,10328
3,LUBUSKIE,5303
4,MAZOWIECKIE,70494
5,MAŁOPOLSKIE,24967
6,OPOLSKIE,4875
7,PODKARPACKIE,10179
8,PODLASKIE,6224
9,POMORSKIE,19736


In [10]:
data = df[['MainAddressVoivodeship', 'MainAddressCounty', 'PKDMainSection']]

# Rows: voivodeships, columns: PKD sections, values: number of businesses.
matrix = data.groupby(['MainAddressVoivodeship', 'PKDMainSection']).size().unstack(fill_value=0)

# Share of each section within a voivodeship, z-scored per section (column),
# then the section with the highest z-score for every voivodeship.
matrix_proportions = matrix.div(matrix.sum(axis=1), axis=0)
normalized = matrix_proportions.apply(zscore)
normalized['Max'] = normalized.idxmax(axis=1)
print(normalized['Max'])

# Same ranking on absolute counts instead of shares.
normalized_absolute = matrix.apply(zscore)
# There are few businesses in section T, so it is skipped: any non-zero count
# there blows the z-score out of proportion. The section is excluded by label
# rather than by position so the right column is dropped regardless of which
# sections are present; errors='ignore' keeps this working when T is absent.
normalized_absolute['Max'] = (
    normalized_absolute.drop(columns=['T'], errors='ignore').idxmax(axis=1)
)
print(normalized_absolute['Max'])

MainAddressVoivodeship
DOLNOŚLĄSKIE           K
KUJAWSKO-POMORSKIE     D
LUBELSKIE              B
LUBUSKIE               S
MAZOWIECKIE            M
MAŁOPOLSKIE            R
OPOLSKIE               K
PODKARPACKIE           B
PODLASKIE              A
POMORSKIE              O
WARMIŃSKO-MAZURSKIE    Q
WIELKOPOLSKIE          N
ZACHODNIOPOMORSKIE     I
ŁÓDZKIE                G
ŚLĄSKIE                K
ŚWIĘTOKRZYSKIE         E
Name: Max, dtype: object
MainAddressVoivodeship
DOLNOŚLĄSKIE           F
KUJAWSKO-POMORSKIE     D
LUBELSKIE              B
LUBUSKIE               O
MAZOWIECKIE            J
MAŁOPOLSKIE            F
OPOLSKIE               O
PODKARPACKIE           B
PODLASKIE              A
POMORSKIE              O
WARMIŃSKO-MAZURSKIE    A
WIELKOPOLSKIE          A
ZACHODNIOPOMORSKIE     I
ŁÓDZKIE                B
ŚLĄSKIE                C
ŚWIĘTOKRZYSKIE         E
Name: Max, dtype: object


In [17]:
# Column dtypes of the current `data` selection, shown as the cell's result.
# NOTE(review): the stored output lists six columns although the nearest
# preceding assignment to `data` selects three, so it was presumably produced
# from out-of-order execution - re-run on a fresh kernel to confirm.
data.dtypes

MainAddressVoivodeship     object
MainAddressCounty          object
PKDMainSection             object
PKDMainDivision           float64
PKDMainGroup              float64
PKDMainClass              float64
dtype: object


In [26]:
def _load_classification(path, read_as_text=False, numeric_symbol=True):
    """Load one PKD classification level and prepare it for the dropdowns.

    The 'name' column is rewritten to '<symbol>-<name>' and the rows are
    sorted by 'symbol'.

    Parameters
    ----------
    path : str
        CSV file with at least 'symbol' and 'name' columns.
    read_as_text : bool
        Read every column as ``str`` instead of letting pandas infer dtypes.
    numeric_symbol : bool
        Convert 'symbol' to float64 after building the label, so the level
        sorts numerically; when False the symbol stays textual.

    Returns
    -------
    pandas.DataFrame
        The loaded level, sorted by 'symbol'.
    """
    level = pd.read_csv(path, dtype=str if read_as_text else None)
    # The label always uses the textual form of the symbol.
    symbol_text = level['symbol'].astype(str)
    level['name'] = symbol_text + '-' + level['name']
    level['symbol'] = symbol_text.astype('float64') if numeric_symbol else symbol_text
    return level.sort_values(by='symbol')


sections = _load_classification('./data/section_list.csv', read_as_text=True, numeric_symbol=False)
divisions = _load_classification('./data/division_list.csv', read_as_text=True)
groups = _load_classification('./data/group_list.csv')
classes = _load_classification('./data/class_list.csv')

In [27]:
# Region boundaries for both map granularities.
with open('./geo/wojewodztwa-min.geojson', 'r', encoding="utf8") as voivodeships_handle:
    geojson_voivodeships = json.load(voivodeships_handle)
with open('./geo/powiaty-min.geojson', 'r', encoding="utf8") as counties_handle:
    geojson_counties = json.load(counties_handle)

# Location columns plus the four PKD classification levels.
classification_columns = ['PKDMainSection', 'PKDMainDivision', 'PKDMainGroup', 'PKDMainClass']
data = df[['MainAddressVoivodeship', 'MainAddressCounty'] + classification_columns]

# Distinct classification paths with every level present.
possible_classification_combinations = (
    data[classification_columns]
    .drop_duplicates()
    .dropna()
)

# Dropdown options for the top classification level: one entry per section.
section_list = [
    {'label': label, 'value': symbol}
    for label, symbol in zip(sections['name'], sections['symbol'])
]

stylesheets = [dbc.themes.BOOTSTRAP]
app = dash.Dash(__name__, external_stylesheets=stylesheets)


def _labelled_dropdown(caption, **dropdown_props):
    """Wrap a label and a dropdown built from ``dropdown_props`` in a form group."""
    return dbc.FormGroup([dbc.Label(caption), dcc.Dropdown(**dropdown_props)])


# Map granularity choices offered by the first dropdown.
area_options = [
    {'label': 'Województwa', 'value': 'voivodeships'},
    {'label': 'Powiaty', 'value': 'counties'},
]

# Side panel: map granularity followed by the four classification levels.
controls = dbc.Card(
    [
        _labelled_dropdown("Podział", id="Podział", value='voivodeships', options=area_options),
        _labelled_dropdown("Sekcja", id="section-dropdown", options=section_list),
        _labelled_dropdown("Dział", id="division-dropdown"),
        _labelled_dropdown("Grupa", id="group-dropdown"),
        _labelled_dropdown("Klasa", id="class-dropdown"),
    ],
    body=True,
)


# Page layout: title, separator, then the controls card next to the map.
app.layout = dbc.Container(
    [
        html.H2("Charakterystyka przestrzenna działalności gospodarczej"),
        html.Hr(),
        dbc.Row(
            [
                dbc.Col(controls, md=3),
                dbc.Col(dcc.Graph(id="graph"), md=9),
            ],
            # Row's `align` prop controls vertical alignment and does not
            # accept "left"; horizontal start alignment is set with `justify`.
            justify="start"
        )
    ]
)


def _filter_by_classification(frame, section, division, group, _class):
    """Keep the rows of ``frame`` matching the deepest selected PKD level.

    Levels are checked in order section -> division -> group -> class and the
    walk stops at the first level that is None; only the last level reached is
    used as the filter. With no section selected the frame is returned as is.
    """
    levels = (
        ('PKDMainSection', section),
        ('PKDMainDivision', division),
        ('PKDMainGroup', group),
        ('PKDMainClass', _class),
    )
    deepest = None
    for column, value in levels:
        if value is None:
            break
        deepest = (column, value)
    if deepest is None:
        return frame
    column, value = deepest
    return frame[frame[column] == value]


def _region_choropleth(frame, geojson, region_column, id_label, name_prefix_len=0):
    """Draw a choropleth of business counts per region.

    ``frame`` is counted by ``region_column`` and right-joined onto the
    geojson feature properties by upper-cased 'nazwa', so regions without any
    business still appear, with a count of 0. ``name_prefix_len`` characters
    are dropped from the start of each 'nazwa' before matching.
    """
    counts = frame.groupby(region_column).size().to_frame('size').reset_index()
    regions = pd.DataFrame([feature['properties'] for feature in geojson['features']])
    regions['nazwa'] = regions['nazwa'].str[name_prefix_len:].str.upper()
    # NOTE(review): regions are matched by name only, so regions sharing a
    # name would all receive the same combined count - verify for counties.
    map_data = pd.merge(counts, regions.set_index('nazwa'), how='right',
                        left_on=region_column, right_index=True)
    # Assign the result back: a chained `fillna(..., inplace=True)` acts on a
    # temporary object under pandas Copy-on-Write and leaves the NaNs in place.
    map_data['size'] = map_data['size'].fillna(0)
    return px.choropleth(
        map_data, geojson=geojson, color="size", locations="id",
        featureidkey="properties.id", projection="mercator",
        color_continuous_scale="peach",
        labels={'id': id_label, 'size': 'liczba firm'},
        hover_name=region_column, hover_data=['size'],
        height=800,
    )


@app.callback(Output("graph", "figure"), [
        Input("section-dropdown", "value"),
        Input("division-dropdown", "value"),
        Input("group-dropdown", "value"),
        Input("class-dropdown", "value"),
        Input("Podział", "value")
    ])
def make_figure(section, division, group, _class, area_division):
    """Build the map for the current dropdown selection.

    Parameters
    ----------
    section, division, group, _class
        Selected PKD classification values; None means "not selected".
    area_division
        'voivodeships' (also used when None) or any other value for counties.

    Returns
    -------
    plotly figure
        Choropleth of the number of matching businesses per region.
    """
    selected = _filter_by_classification(data, section, division, group, _class)
    if area_division is None or area_division == "voivodeships":
        fig = _region_choropleth(selected, geojson_voivodeships,
                                 'MainAddressVoivodeship', 'id województwa')
    else:
        # The first 7 characters of each county 'nazwa' are dropped before
        # matching - presumably a "powiat " prefix; confirm against the geojson.
        fig = _region_choropleth(selected, geojson_counties,
                                 'MainAddressCounty', 'id powiatu',
                                 name_prefix_len=7)
    fig.update_geos(fitbounds="locations", visible=False, lataxis_range=[50,60], lonaxis_range=[0, 30])
    return fig

@app.callback(Output("division-dropdown", "options"), [Input("section-dropdown", "value")])
def get_division_options(section):
    """Return the division dropdown options whose parent is ``section``."""
    matching = divisions.loc[divisions['parent'] == section]
    return [
        {'label': label, 'value': symbol}
        for label, symbol in zip(matching['name'], matching['symbol'])
    ]

@app.callback(Output("division-dropdown", "value"), [Input("section-dropdown", "value")])
def reset_division_value_on_section_change(section):
    """Clear the division selection whenever the section selection changes."""
    return None

@app.callback(Output("group-dropdown", "options"), [Input("division-dropdown", "value")])
def get_group_options(division):
    """Return the group dropdown options whose parent is ``division``."""
    matching = groups.loc[groups['parent'] == division]
    return [
        {'label': label, 'value': symbol}
        for label, symbol in zip(matching['name'], matching['symbol'])
    ]

@app.callback(Output("group-dropdown", "value"), [Input("section-dropdown", "value"), Input("division-dropdown", "value")])
def reset_group_value_on_section_or_division_change(section, division):
    """Clear the group selection whenever the section or division selection changes."""
    return None

@app.callback(Output("class-dropdown", "options"), [Input("group-dropdown", "value")])
def get_class_options(group):
    """Return the class dropdown options whose parent is ``group``.

    Named after what it returns, so it does not rebind the module-level name
    of the division-options callback.
    """
    matching = classes[classes['parent'] == group]
    return [
        {'label': label, 'value': symbol}
        for label, symbol in zip(matching['name'], matching['symbol'])
    ]

@app.callback(Output("class-dropdown", "value"), [
        Input("section-dropdown", "value"),
        Input("division-dropdown", "value"),
        Input("group-dropdown", "value")])
def reset_class_value_on_section_or_division_or_group_change(section, division, group):
    """Clear the class selection whenever the section, division or group selection changes."""
    return None

# Start the Dash server with default settings. The recorded log shows it
# serving on http://127.0.0.1:8050/ until interrupted with CTRL+C, so this
# cell keeps running for as long as the app is up.
app.run_server()

* Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off
 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [13/May/2020 11:48:17] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [13/May/2020 11:48:18] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [13/May/2020 11:48:18] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [13/May/2020 11:48:18] "[37mGET /_favicon.ico?v=1.12.0 HTTP/1.1[0m" 200 -
127.0.0.1 - - [13/May/2020 11:48:18] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [13/May/2020 11:48:18] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [13/May/2020 11:48:18] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [13/May/2020 11:48:18] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [13/May/2020 11:48:18] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [13/Ma