In [1]:
import os
import json
import numpy as np
import pandas as pd
import plotly.express as px
from scipy.stats import zscore

import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import dash_bootstrap_components as dbc

In [2]:
# Read the voivodeship GeoJSON and build a table with one row per feature,
# holding that feature's `properties`; the `nazwa` column is upper-cased.
with open('./geo/wojewodztwa-min.geojson', 'r', encoding="utf8") as json_file:
    geojson = json.loads(json_file.read())
v_id = pd.DataFrame(
    [feature['properties'] for feature in geojson['features']]
).assign(nazwa=lambda props: props['nazwa'].str.upper())

In [3]:
# Load the cleaned CEIDG classification dataset into a DataFrame.
# NOTE(review): the sample output below shows 'Unnamed: 0' and 'Unnamed: 0.1'
# columns, which look like index columns that were written into the CSV —
# confirm, and consider dropping them or reading with `index_col`.
df = pd.read_csv('./data/ceidg_data_classif_cleaned.csv')

In [4]:
# Display four randomly chosen rows as a quick look at the data.
# No `random_state` is passed, so a re-run shows different rows.
df.sample(4)

Unnamed: 0.1,Unnamed: 0,RandomDate,MonthOfStartingOfTheBusiness,QuarterOfStartingOfTheBusiness,MainAddressVoivodeship,MainAddressCounty,MainAddressTERC,CorrespondenceAddressVoivodeship,CorrespondenceAddressCounty,CorrespondenceAddressTERC,...,ShareholderInOtherCompanies,PKDMainSection,PKDMainDivision,PKDMainGroup,PKDMainClass,NoOfUniquePKDSections,NoOfUniquePKDDivsions,NoOfUniquePKDGroups,NoOfUniquePKDClasses,Target
308888,308888,2018-01-11,September,3,ŚLĄSKIE,ŻYWIECKI,2417152.0,ŚLĄSKIE,ŻYWIECKI,2417152.0,...,False,Q,86.0,869.0,8690.0,3,3,3,4,False
702570,702570,2018-01-12,May,2,ŚLĄSKIE,GLIWICE,2466011.0,ŚLĄSKIE,GLIWICE,2466011.0,...,False,H,49.0,493.0,4932.0,1,1,1,1,False
29194,29194,2018-06-27,August,3,MAZOWIECKIE,WARSZAWA,1465011.0,MAZOWIECKIE,WARSZAWA,1465011.0,...,False,G,47.0,471.0,4719.0,2,4,7,9,False
2339370,2339370,2018-09-06,November,4,PODLASKIE,BIAŁOSTOCKI,2002052.0,PODLASKIE,BIAŁOSTOCKI,2002052.0,...,False,F,43.0,433.0,4331.0,1,1,3,11,True


In [5]:
# Keep only the voivodeship and main PKD section columns.
data = df.loc[:, ['MainAddressVoivodeship', 'PKDMainSection']]

In [6]:
# Count the rows whose main PKD section is 'M', per voivodeship, and show the
# result as a two-column frame (voivodeship, size).
(
    data.loc[data['PKDMainSection'].eq('M')]
    .groupby('MainAddressVoivodeship')
    .size()
    .reset_index(name='size')
)

Unnamed: 0,MainAddressVoivodeship,size
0,DOLNOŚLĄSKIE,23281
1,KUJAWSKO-POMORSKIE,10904
2,LUBELSKIE,10328
3,LUBUSKIE,5303
4,MAZOWIECKIE,70494
5,MAŁOPOLSKIE,24967
6,OPOLSKIE,4875
7,PODKARPACKIE,10179
8,PODLASKIE,6224
9,POMORSKIE,19736


In [7]:
data = df.loc[:, ['MainAddressVoivodeship', 'MainAddressCounty', 'PKDMainSection']]

# Voivodeship x section contingency table of row counts (absent pairs -> 0).
matrix = (
    data.groupby(['MainAddressVoivodeship', 'PKDMainSection'])
    .size()
    .unstack(fill_value=0)
)

# Each row rescaled to sum to 1: the share of every section within a voivodeship.
matrix_proportions = matrix.div(matrix.sum(axis='columns'), axis='index')

# Z-score each section column across voivodeships, then record, per
# voivodeship, the section with the highest z-score.
normalized = matrix_proportions.apply(zscore).assign(
    Max=lambda scores: scores.idxmax(axis='columns')
)
print(normalized['Max'])

# Same procedure on the raw counts instead of the shares.
# There are few businesses in section T, so it is skipped: its z-score shoots
# off the scale wherever any are present.
# NOTE(review): the exclusion is positional (the last section column is
# dropped) — confirm that the last column really is 'T'.
normalized_absolute = matrix.apply(zscore).assign(
    Max=lambda scores: scores.iloc[:, :-1].idxmax(axis='columns')
)
print(normalized_absolute['Max'])

MainAddressVoivodeship
DOLNOŚLĄSKIE           K
KUJAWSKO-POMORSKIE     D
LUBELSKIE              B
LUBUSKIE               S
MAZOWIECKIE            M
MAŁOPOLSKIE            R
OPOLSKIE               K
PODKARPACKIE           B
PODLASKIE              A
POMORSKIE              O
WARMIŃSKO-MAZURSKIE    Q
WIELKOPOLSKIE          N
ZACHODNIOPOMORSKIE     I
ŁÓDZKIE                G
ŚLĄSKIE                K
ŚWIĘTOKRZYSKIE         E
Name: Max, dtype: object
MainAddressVoivodeship
DOLNOŚLĄSKIE           F
KUJAWSKO-POMORSKIE     D
LUBELSKIE              B
LUBUSKIE               O
MAZOWIECKIE            J
MAŁOPOLSKIE            F
OPOLSKIE               O
PODKARPACKIE           B
PODLASKIE              A
POMORSKIE              O
WARMIŃSKO-MAZURSKIE    A
WIELKOPOLSKIE          A
ZACHODNIOPOMORSKIE     I
ŁÓDZKIE                B
ŚLĄSKIE                C
ŚWIĘTOKRZYSKIE         E
Name: Max, dtype: object


In [None]:
# Boundary geometries for the two map granularities offered by the app.
with open('./geo/wojewodztwa-min.geojson', 'r', encoding="utf8") as json_file:
    geojson_voivodeships = json.loads(json_file.read())

with open('./geo/powiaty-min.geojson', 'r', encoding="utf8") as json_file:
    geojson_counties = json.loads(json_file.read())

# Columns the dashboard needs: both region levels and the main PKD section.
data = df.loc[:, ['MainAddressVoivodeship', 'MainAddressCounty', 'PKDMainSection']]

# Sorted distinct section codes; values whose string form is 'nan' (missing
# entries) are left out.
sections = sorted(code for code in data['PKDMainSection'].unique() if str(code) != 'nan')

# Dropdown entries: every section code serves as both label and value.
col_options = [{'label': code, 'value': code} for code in sections]

# Names (and component ids) of the dropdowns generated from `col_options`.
dimensions = ["Sekcja"]

# Dash application with two external stylesheets.
app = dash.Dash(
    __name__,
    external_stylesheets=[
        "https://codepen.io/chriddyp/pen/bWLwgP.css",
        dbc.themes.BOOTSTRAP,
    ],
)

# Page layout: two dropdown panels floated left (20% wide each) and the map
# graph taking the remaining 80%.
app.layout = html.Div(
    [
        # Selector for the administrative division shown on the map.
        html.Div(
            [
                html.P(
                    [
                        "Podział:",
                        dcc.Dropdown(
                            id="Podział",
                            options=[
                                {'label': 'Województwa', 'value': 'voivodeships'},
                                {'label': 'Powiaty', 'value': 'counties'},
                            ],
                        ),
                    ]
                )
            ],
            style={"width": "20%", "float": "left"},
        ),
        # Empty spacer block next to the first selector.
        html.Div([], style={"width": "80%"}),
        # One labelled dropdown per entry in `dimensions`, all sharing `col_options`.
        html.Div(
            [
                html.P([f"{dimension}:", dcc.Dropdown(id=dimension, options=col_options)])
                for dimension in dimensions
            ],
            style={"width": "20%", "float": "left"},
        ),
        dcc.Graph(id="graph", style={"width": "80%", "display": "inline-block"}),
    ]
)


def _region_choropleth(section, region_column, geojson, id_label, name_offset=0):
    """Build a choropleth of business counts per region.

    Parameters
    ----------
    section : str or None
        PKD main section to filter on; ``None`` counts all rows of ``data``.
    region_column : str
        Column of ``data`` holding the region name to group by.
    geojson : dict
        Parsed GeoJSON whose features carry ``properties`` with at least
        ``nazwa`` (region name) and ``id``.
    id_label : str
        Display label for the ``id`` field.
    name_offset : int, default 0
        Number of leading characters to drop from every ``nazwa`` before it is
        upper-cased and matched against ``region_column``.

    Returns
    -------
    plotly figure
        The figure returned by ``px.choropleth``, coloured by row count.
    """
    subset = data if section is None else data[data['PKDMainSection'] == section]
    v_size = subset.groupby(region_column).size().to_frame('size').reset_index()

    # One row per GeoJSON feature; names are normalised to match the dataset.
    v_id = pd.DataFrame([feature['properties'] for feature in geojson['features']])
    v_id['nazwa'] = v_id['nazwa'].str[name_offset:].str.upper()

    # Inner join: regions with no matching rows are absent from the map.
    # NOTE(review): matching is by name only. If a name occurs more than once
    # (county names may repeat across voivodeships), counts are pooled by the
    # groupby and duplicated by the merge — verify against the GeoJSON.
    map_data = pd.merge(v_size, v_id.set_index('nazwa'), left_on=region_column, right_index=True)

    return px.choropleth(
        map_data,
        geojson=geojson,
        color="size",
        locations="id",
        featureidkey="properties.id",
        projection="mercator",
        color_continuous_scale="peach",
        labels={'id': id_label, 'size': 'liczba zarejestrowanych działalności'},
        hover_name=region_column,
        hover_data=['size'],
        height=800,
    )


@app.callback(Output("graph", "figure"), [Input("Sekcja", "value"), Input("Podział", "value")])
def make_figure(section, division):
    """Dash callback: redraw the map for the selected section and division.

    Parameters
    ----------
    section : str or None
        Value of the "Sekcja" dropdown; ``None`` means no section filter.
    division : str or None
        Value of the "Podział" dropdown. ``None`` or ``"voivodeships"`` draws
        voivodeships; any other value draws counties.

    Returns
    -------
    plotly figure
        Choropleth fitted to the drawn locations, with the base map hidden.
    """
    if division is None or division == "voivodeships":
        fig = _region_choropleth(
            section, 'MainAddressVoivodeship', geojson_voivodeships, 'id województwa'
        )
    else:
        # The first 7 characters of each county name are dropped —
        # presumably a "powiat " prefix; TODO confirm against the GeoJSON.
        fig = _region_choropleth(
            section, 'MainAddressCounty', geojson_counties, 'id powiatu', name_offset=7
        )
    fig.update_geos(fitbounds="locations", visible=False, lataxis_range=[50,60], lonaxis_range=[0, 30])
    return fig

# Start the Dash server with default settings. The call blocks: the cell keeps
# running (and logging requests) until the server is interrupted.
app.run_server()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [11/May/2020 19:40:22] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/May/2020 19:40:22] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/May/2020 19:40:22] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/May/2020 19:40:22] "[37mGET /_favicon.ico?v=1.12.0 HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/May/2020 19:40:23] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
