In [1]:
import os
import json
import numpy as np
import pandas as pd
import plotly.express as px
from scipy.stats import zscore

import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import dash_bootstrap_components as dbc

In [2]:
with open('./geo/wojewodztwa-min.geojson', 'r', encoding="utf8") as json_file:
    geojson = json.load(json_file)
v_id = pd.DataFrame([v['properties'] for v in geojson['features']])
v_id['nazwa'] = v_id['nazwa'].str.upper()

In [3]:
df = pd.read_csv('./data/ceidg_data_classif_cleaned.csv')

In [4]:
df.sample(4)

Unnamed: 0.1,Unnamed: 0,RandomDate,MonthOfStartingOfTheBusiness,QuarterOfStartingOfTheBusiness,MainAddressVoivodeship,MainAddressCounty,MainAddressTERC,CorrespondenceAddressVoivodeship,CorrespondenceAddressCounty,CorrespondenceAddressTERC,...,ShareholderInOtherCompanies,PKDMainSection,PKDMainDivision,PKDMainGroup,PKDMainClass,NoOfUniquePKDSections,NoOfUniquePKDDivsions,NoOfUniquePKDGroups,NoOfUniquePKDClasses,Target
153592,153592,2017-11-10,July,3,MAZOWIECKIE,WARSZAWA,1465038.0,MAZOWIECKIE,WARSZAWA,1465088.0,...,False,M,70.0,702.0,7022.0,2,2,2,2,False
1976450,1976450,2018-06-28,October,4,DOLNOŚLĄSKIE,KŁODZKI,208021.0,DOLNOŚLĄSKIE,KŁODZKI,208021.0,...,False,S,95.0,952.0,9522.0,6,7,8,8,False
821446,821446,2018-02-04,February,1,ŚLĄSKIE,ZAWIERCIAŃSKI,2416065.0,ŚLĄSKIE,ZAWIERCIAŃSKI,2416065.0,...,False,G,46.0,467.0,4677.0,5,9,18,29,False
2000583,2000583,2018-04-08,March,1,DOLNOŚLĄSKIE,WAŁBRZYCH,265011.0,DOLNOŚLĄSKIE,WAŁBRZYCH,265011.0,...,False,Q,88.0,881.0,8810.0,3,5,7,8,False


In [5]:
data = df[['MainAddressVoivodeship', 'PKDMainSection']]

In [6]:
data[data['PKDMainSection'] == 'M'].groupby('MainAddressVoivodeship').size().to_frame('size').reset_index()

Unnamed: 0,MainAddressVoivodeship,size
0,DOLNOŚLĄSKIE,23281
1,KUJAWSKO-POMORSKIE,10904
2,LUBELSKIE,10328
3,LUBUSKIE,5303
4,MAZOWIECKIE,70494
5,MAŁOPOLSKIE,24967
6,OPOLSKIE,4875
7,PODKARPACKIE,10179
8,PODLASKIE,6224
9,POMORSKIE,19736


In [7]:
data = df[['MainAddressVoivodeship', 'MainAddressCounty', 'PKDMainSection']]
matrix = data.groupby(['MainAddressVoivodeship','PKDMainSection']).size().unstack(fill_value=0)
matrix_proportions = matrix.div(matrix.sum(axis=1), axis=0)
normalized = matrix_proportions.apply(zscore)
normalized['Max'] = normalized.idxmax(axis=1)
print(normalized['Max'])

normalized_absolute = matrix.apply(zscore)
# mało działalności z T, więc pomijamy, bo wywala w kosmos Z Score jak już coś jest
normalized_absolute['Max'] = normalized_absolute.iloc[:,:-1].idxmax(axis=1)
print(normalized_absolute['Max'])

MainAddressVoivodeship
DOLNOŚLĄSKIE           K
KUJAWSKO-POMORSKIE     D
LUBELSKIE              B
LUBUSKIE               S
MAZOWIECKIE            M
MAŁOPOLSKIE            R
OPOLSKIE               K
PODKARPACKIE           B
PODLASKIE              A
POMORSKIE              O
WARMIŃSKO-MAZURSKIE    Q
WIELKOPOLSKIE          N
ZACHODNIOPOMORSKIE     I
ŁÓDZKIE                G
ŚLĄSKIE                K
ŚWIĘTOKRZYSKIE         E
Name: Max, dtype: object
MainAddressVoivodeship
DOLNOŚLĄSKIE           F
KUJAWSKO-POMORSKIE     D
LUBELSKIE              B
LUBUSKIE               O
MAZOWIECKIE            J
MAŁOPOLSKIE            F
OPOLSKIE               O
PODKARPACKIE           B
PODLASKIE              A
POMORSKIE              O
WARMIŃSKO-MAZURSKIE    A
WIELKOPOLSKIE          A
ZACHODNIOPOMORSKIE     I
ŁÓDZKIE                B
ŚLĄSKIE                C
ŚWIĘTOKRZYSKIE         E
Name: Max, dtype: object


In [8]:
data = df[['MainAddressVoivodeship', 'MainAddressCounty', 'PKDMainSection', 'PKDMainDivision',
          'PKDMainGroup', 'PKDMainClass']]
possible_classification_combinations = data[['PKDMainSection', 'PKDMainDivision', 'PKDMainGroup', 'PKDMainClass']].drop_duplicates().dropna()
print(possible_classification_combinations)

        PKDMainSection  PKDMainDivision  PKDMainGroup  PKDMainClass
1                    M             71.0         711.0        7112.0
2                    C             14.0         141.0        1412.0
3                    G             46.0         461.0        4619.0
4                    C             11.0         110.0        1107.0
5                    L             68.0         682.0        6820.0
...                ...              ...           ...           ...
2013867              K             65.0         653.0        6530.0
2216634              C             11.0         110.0        1106.0
2221395              B              5.0          52.0         520.0
2227038              O             84.0         841.0        8413.0
2316879              O             84.0         841.0        8411.0

[588 rows x 4 columns]


In [None]:
with open('./geo/wojewodztwa-min.geojson', 'r', encoding="utf8") as json_file:
    geojson_voivodeships = json.load(json_file)

with open('./geo/powiaty-min.geojson', 'r', encoding="utf8") as json_file:
    geojson_counties = json.load(json_file)

data = df[['MainAddressVoivodeship', 'MainAddressCounty', 'PKDMainSection', 'PKDMainDivision',
          'PKDMainGroup', 'PKDMainClass']]
sections = sorted([x for x in data['PKDMainSection'].unique() if str(x) != 'nan'])
possible_classification_combinations = data[['PKDMainSection', 'PKDMainDivision', 'PKDMainGroup', 'PKDMainClass']].drop_duplicates().dropna()
section_options = [dict(label=x, value=x) for x in sections]

app = dash.Dash(
    __name__, external_stylesheets=[dbc.themes.BOOTSTRAP]
)


controls = dbc.Card([
        dbc.FormGroup(
            [ 
                dbc.Label("Podział"),
                dcc.Dropdown(id="Podział", value='voivodeships', options=[
                    {'label': 'Województwa', 'value': 'voivodeships'}, {'label': 'Powiaty', 'value': 'counties'}
                ])
            ]
        ),
        dbc.FormGroup(
            [ 
                dbc.Label("Sekcja"),
                dcc.Dropdown(id="section-dropdown", options=section_options)
            ]
        ),
        dbc.FormGroup(
            [ 
                dbc.Label("Dział"),
                dcc.Dropdown(id="division-dropdown")
            ]
        ),
        dbc.FormGroup(
            [ 
                dbc.Label("Grupa"),
                dcc.Dropdown(id="group-dropdown")
            ]
        ),
        dbc.FormGroup(
            [ 
                dbc.Label("Klasa"),
                dcc.Dropdown(id="class-dropdown")
            ]
        )
    ],
    body=True)


app.layout = dbc.Container(
    [
        html.H2("Charakterystyka przestrzenna działalności gospodarczej"),
        html.Hr(),
        dbc.Row(
            [
                dbc.Col(controls, md=3),
                dbc.Col(dcc.Graph(id="graph"), md=9),
            ],
            align="left"
        )
    ]
)


@app.callback(Output("graph", "figure"), [
        Input("section-dropdown", "value"),
        Input("division-dropdown", "value"),
        Input("group-dropdown", "value"),
        Input("class-dropdown", "value"),
        Input("Podział", "value")
    ])
def make_figure(section, division, group, _class, area_division):
    print(section)
    print(division)
    print(area_division)
    print(group)
    print(_class)
    if area_division is None or area_division == "voivodeships":
        if section is None:
            v_size = data.groupby('MainAddressVoivodeship').size().to_frame('size').reset_index()
        elif division is None:
            v_size = data[data['PKDMainSection'] == section].groupby('MainAddressVoivodeship').size().to_frame('size').reset_index()
        elif group is None:
            v_size = data[data['PKDMainDivision'] == division].groupby('MainAddressVoivodeship').size().to_frame('size').reset_index()
        elif _class is None:
            v_size = data[data['PKDMainGroup'] == group].groupby('MainAddressVoivodeship').size().to_frame('size').reset_index()
        else:
            v_size = data[data['PKDMainClass'] == _class].groupby('MainAddressVoivodeship').size().to_frame('size').reset_index()
        geojson = geojson_voivodeships
        v_id = pd.DataFrame([v['properties'] for v in geojson['features']])
        v_id['nazwa'] = v_id['nazwa'].str.upper()
        map_data = pd.merge(v_size, v_id.set_index('nazwa'), left_on='MainAddressVoivodeship', right_index=True)
        fig = px.choropleth(map_data, geojson=geojson, color="size", locations="id", featureidkey="properties.id", projection="mercator", color_continuous_scale="peach", 
                    labels={'id': 'id województwa', 'size': 'liczba firm'}, hover_name="MainAddressVoivodeship", hover_data=['size'],
                           height=800)
    else:
        if section is None:
            v_size = data.groupby('MainAddressCounty').size().to_frame('size').reset_index()
        elif division is None:
            v_size = data[data['PKDMainSection'] == section].groupby('MainAddressCounty').size().to_frame('size').reset_index()
        elif group is None:
            v_size = data[data['PKDMainDivision'] == division].groupby('MainAddressCounty').size().to_frame('size').reset_index()
        elif _class is None:
            v_size = data[data['PKDMainGroup'] == group].groupby('MainAddressCounty').size().to_frame('size').reset_index()
        else:
            v_size = data[data['PKDMainClass'] == _class].groupby('MainAddressCounty').size().to_frame('size').reset_index()
        geojson = geojson_counties
        v_id = pd.DataFrame([v['properties'] for v in geojson_counties['features']])
        v_id['nazwa'] = v_id['nazwa'].str[7:]
        v_id['nazwa'] = v_id['nazwa'].str.upper()
        map_data = pd.merge(v_size, v_id.set_index('nazwa'), left_on='MainAddressCounty', right_index=True)
        fig = px.choropleth(map_data, geojson=geojson, color="size", locations="id", featureidkey="properties.id", projection="mercator", color_continuous_scale="peach", 
                            labels={'id': 'id powiatu', 'size': 'liczba firm'}, hover_name="MainAddressCounty", hover_data=['size'],
                           height=800)
    fig.update_geos(fitbounds="locations", visible=False, lataxis_range=[50,60], lonaxis_range=[0, 30])
    return fig

@app.callback(Output("division-dropdown", "options"), [Input("section-dropdown", "value")])
def get_division_options(section):
    divisions = sorted([x for x in possible_classification_combinations.loc[possible_classification_combinations['PKDMainSection'] == section]['PKDMainDivision'].unique()])
    division_options = [dict(label=x, value=x) for x in divisions]
    return division_options

@app.callback(Output("division-dropdown", "value"), [Input("section-dropdown", "value")])
def reset_division_value_on_section_change(section):
    return None

@app.callback(Output("group-dropdown", "options"), [Input("division-dropdown", "value")])
def get_group_options(division):
    groups = sorted([x for x in possible_classification_combinations.loc[possible_classification_combinations['PKDMainDivision'] == division]['PKDMainGroup'].unique()])
    group_options = [dict(label=x, value=x) for x in groups]
    return group_options

@app.callback(Output("group-dropdown", "value"), [Input("section-dropdown", "value"), Input("division-dropdown", "value")])
def reset_group_value_on_section_or_division_change(section, division):
    return None

@app.callback(Output("class-dropdown", "options"), [Input("group-dropdown", "value")])
def get_division_options(group):
    classes = sorted([x for x in possible_classification_combinations.loc[possible_classification_combinations['PKDMainGroup'] == group]['PKDMainClass'].unique()])
    class_options = [dict(label=x, value=x) for x in classes]
    return class_options

@app.callback(Output("class-dropdown", "value"), [
        Input("section-dropdown", "value"),
        Input("division-dropdown", "value"),
        Input("group-dropdown", "value")])
def reset_class_value_on_section_or_division_or_group_change(section, division, group):
    return None

app.run_server()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [12/May/2020 19:38:50] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:50] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:50] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:50] "[37mGET /_favicon.ico?v=1.12.0 HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:50] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:50] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:50] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:50] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:50] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:50] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


None
None
voivodeships
None
None


127.0.0.1 - - [12/May/2020 19:38:51] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


None
None
counties
None
None


127.0.0.1 - - [12/May/2020 19:38:53] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:55] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:55] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:55] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:55] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:55] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:55] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


B
None
counties
None
None


127.0.0.1 - - [12/May/2020 19:38:55] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:57] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:57] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:57] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:57] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


B
6
counties
None
None


127.0.0.1 - - [12/May/2020 19:38:57] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:59] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:38:59] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


B
6
counties
61
None


127.0.0.1 - - [12/May/2020 19:38:59] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


B
6
counties
61
610


127.0.0.1 - - [12/May/2020 19:39:02] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:04] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:04] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:04] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:04] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


B
5
counties
None
None


127.0.0.1 - - [12/May/2020 19:39:04] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:05] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:05] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


B
5
counties
52
None


127.0.0.1 - - [12/May/2020 19:39:06] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:17] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:17] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:17] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:17] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:17] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:17] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


O
None
counties
None
None


127.0.0.1 - - [12/May/2020 19:39:17] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:20] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:20] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:20] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:20] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


O
84
counties
None
None


127.0.0.1 - - [12/May/2020 19:39:20] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:22] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:22] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


O
84
counties
841
None


127.0.0.1 - - [12/May/2020 19:39:22] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:24] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:39:24] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


O
84
counties
842
None


127.0.0.1 - - [12/May/2020 19:39:24] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


O
84
counties
842
8425


127.0.0.1 - - [12/May/2020 19:39:26] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


O
84
counties
842
8422


127.0.0.1 - - [12/May/2020 19:39:29] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:42:08] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:42:08] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:42:08] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:42:08] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:42:08] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:42:08] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


L
None
counties
None
None


127.0.0.1 - - [12/May/2020 19:42:09] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:42:10] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:42:10] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:42:10] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:42:10] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


L
68
counties
None
None


127.0.0.1 - - [12/May/2020 19:42:10] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:42:11] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/May/2020 19:42:11] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


L
68
counties
682
None


127.0.0.1 - - [12/May/2020 19:42:11] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


L
68
counties
682
6820


127.0.0.1 - - [12/May/2020 19:42:13] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
