In [1]:
import pandas as pd
import openpyxl
import numpy as np

In [16]:
# Lookup from the two-character `satzart` code (as read from the sheet, i.e.
# a string) to the name of the administrative level it denotes. The keys are
# also used to decide which rows of the raw sheet are kept.
satzart_dict = dict(
    zip(
        ("10", "20", "30", "40", "50", "60"),
        (
            "Land",
            "Regierungsbezirk",
            "Region",
            "Kreis",
            "Gemeindeverband",
            "Gemeinde",
        ),
    )
)

In [17]:
# Lookup from the two-character `textkennzeichen` code (string) to its
# human-readable label. Built from (code, label) pairs; insertion order
# follows the numeric order of the codes.
textkennzeichen_dict = dict([
    ("41", "Kreisfreie Stadt"),
    ("42", "Stadtkreis"),
    ("43", "Kreis"),
    ("44", "Landkreis"),
    ("45", "Regionalverband"),
    ("50", "Verbandsfreie Gemeinde"),
    ("51", "Amt"),
    ("52", "Samtgemeinde"),
    ("53", "Verbandsgemeinde"),
    ("54", "Verwaltungsgemeinschaft"),
    ("55", "Kirchspielslandgemeinde"),
    ("56", "Verwaltungsverband"),
    ("58", "Erfüllende Gemeinde"),
    ("60", "Markt"),
    ("61", "Kreisfreie Stadt"),
    ("62", "Stadtkreis"),
    ("63", "Stadt"),
    ("64", "Kreisangehörige Gemeinde"),
    ("65", "gemeindefreies Gebiet-bewohnt"),
    ("66", "gemeindefreies Gebiet-unbewohnt"),
    ("67", "Große Kreisstadt"),
])

In [18]:
# Names assigned, in order, to the 20 sheet columns that are read.
col_names = [
    # record type and text marker
    "satzart",
    "textkennzeichen",
    # individual ARS components
    "ars_land",
    "ars_rb",
    "ars_kreis",
    "ars_vb",
    "ars_gemeinde",
    # name, area and population figures
    "gemeindename",
    "flaeche_km2",
    "bev_gesamt",
    "bev_maennl",
    "bev_weibl",
    "bev_pro_km2",
    # postal code and coordinates
    "plz",
    "longitude",
    "latitude",
    # classification columns
    "reisegebiet_id",
    "reisegebiet_name",
    "urbanisierung_id",
    "urbanisierung_cat",
]

# Every column is read as `object`, i.e. no numeric type inference is applied
# (presumably so that code columns keep their exact text form - TODO confirm).
dtype_dict = dict.fromkeys(col_names, object)

In [23]:
# Read columns A:T of the "Onlineprodukt_Gemeinden" sheet. The first six rows
# are skipped and no header row is parsed; column names and dtypes come from
# `col_names` / `dtype_dict` instead.
filepath = "../data/raw/GV100AD/AuszugGV4QAktuell.xlsx"
data = pd.read_excel(
    filepath,
    sheet_name="Onlineprodukt_Gemeinden",
    skiprows=6,
    header=None,
    engine="openpyxl",
    usecols="A:T",
    names=col_names,
    dtype=dtype_dict,
)
# Keep only rows whose satzart is one of the known record types. The explicit
# .copy() makes the result an independent frame, so that adding columns to
# `data` afterwards does not trigger pandas' SettingWithCopyWarning.
data = data[data["satzart"].isin(list(satzart_dict))].copy()

In [24]:
# Build the cumulative ARS keys: each `_ars_<level>` column is the string
# concatenation of all component columns from `ars_land` down to that level.
# A missing component yields NaN for that key and every deeper one.
_prefix = data["ars_land"]
for _part in ("ars_rb", "ars_kreis", "ars_vb", "ars_gemeinde"):
    _prefix = _prefix + data[_part]
    data["_" + _part] = _prefix

# Name lookups per administrative level: the rows of the given satzart,
# keyed by the cumulative ARS key of that level.
land_dict = (
    data.loc[data["satzart"] == "10"]
    .set_index("ars_land")["gemeindename"]
    .to_dict()
)
rb_dict = (
    data.loc[data["satzart"] == "20"]
    .set_index("_ars_rb")["gemeindename"]
    .to_dict()
)
kreis_dict = (
    data.loc[data["satzart"] == "40"]
    .set_index("_ars_kreis")["gemeindename"]
    .to_dict()
)
vb_dict = (
    data.loc[data["satzart"] == "50"]
    .set_index("_ars_vb")["gemeindename"]
    .to_dict()
)

# Attach the parent names to every row; keys without a match become NaN.
data["land"] = data["ars_land"].map(land_dict)
data["rb"] = data["_ars_rb"].map(rb_dict)
data["kreis"] = data["_ars_kreis"].map(kreis_dict)
data["vb"] = data["_ars_vb"].map(vb_dict)
# Translate the textkennzeichen code into its label.
data["kennzeichen"] = data["textkennzeichen"].map(textkennzeichen_dict)

In [25]:
# Columns kept in the processed table, in output order.
col_selection = [
    'land',
    'rb',
    'kreis',
    'vb',
    'gemeindename',
    '_ars_gemeinde',
    'kennzeichen',
    'flaeche_km2',
    'bev_gesamt',
    'bev_maennl',
    'bev_weibl',
    'bev_pro_km2',
    'plz',
    'longitude',
    'latitude',
    'reisegebiet_name',
    'urbanisierung_cat',
]
# Keep only rows that carry a Gemeinde code, restrict them to the selected
# columns and renumber the rows from 0.
data = data.loc[data["ars_gemeinde"].notna(), col_selection].reset_index(drop=True)

In [39]:
def _strip_description(name):
    """Return `name` without the description that follows its first comma.

    Non-string values (e.g. NaN left behind by an unmatched lookup) are
    returned unchanged. A string is also returned unchanged when it has no
    comma or when the comma is its very first character.
    """
    if not isinstance(name, str):
        return name
    comma_at = name.find(',')
    return name[:comma_at] if comma_at > 0 else name


# Drop the description after the ',' in the kreis, vb and gemeindename
# columns. All three go through the same helper so that missing values are
# tolerated everywhere, not only in 'kreis'.
for _col in ('kreis', 'vb', 'gemeindename'):
    data[_col] = [_strip_description(value) for value in data[_col]]

In [40]:
# Preview the first ten rows of the processed table.
data.head(n=10)

Unnamed: 0,land,rb,kreis,vb,gemeindename,_ars_gemeinde,kennzeichen,flaeche_km2,bev_gesamt,bev_maennl,bev_weibl,bev_pro_km2,plz,longitude,latitude,reisegebiet_name,urbanisierung_cat
0,Schleswig-Holstein,,Flensburg,Flensburg,Flensburg,10010000000,Kreisfreie Stadt,53.02,90164,44904,45260,1701,24937,943751,5478252,Ostsee,dicht besiedelt
1,Schleswig-Holstein,,Kiel,Kiel,Kiel,10020000000,Kreisfreie Stadt,118.65,246794,120198,126596,2080,24103,1013727,54321775,Ostsee,dicht besiedelt
2,Schleswig-Holstein,,Lübeck,Lübeck,Lübeck,10030000000,Kreisfreie Stadt,214.19,216530,104032,112498,1011,23552,10683932,53866269,Ostsee,dicht besiedelt
3,Schleswig-Holstein,,Neumünster,Neumünster,Neumünster,10040000000,Kreisfreie Stadt,71.66,80196,39723,40473,1119,24534,9988422,54069895,übrig. Schleswig-Holstein,dicht besiedelt
4,Schleswig-Holstein,,Dithmarschen,Brunsbüttel,Brunsbüttel,10510011011,Stadt,65.21,12380,6240,6140,190,25541,913735,53896932,Nordsee,mittlere Besiedlungsdichte
5,Schleswig-Holstein,,Dithmarschen,Heide,Heide,10510044044,Stadt,31.97,21852,10435,11417,684,25746,9091156,54193962,übrig. Schleswig-Holstein,mittlere Besiedlungsdichte
6,Schleswig-Holstein,,Dithmarschen,Burg-St. Michaelisdonn,Averlak,10515163003,Kreisangehörige Gemeinde,9.06,554,288,266,61,25715,9182663,53941154,übrig. Schleswig-Holstein,gering besiedelt
7,Schleswig-Holstein,,Dithmarschen,Burg-St. Michaelisdonn,Brickeln,10515163010,Kreisangehörige Gemeinde,6.07,212,114,98,35,25712,9221166,5400849,übrig. Schleswig-Holstein,gering besiedelt
8,Schleswig-Holstein,,Dithmarschen,Burg-St. Michaelisdonn,Buchholz,10515163012,Kreisangehörige Gemeinde,14.56,998,520,478,69,25712,9223774,53987132,übrig. Schleswig-Holstein,gering besiedelt
9,Schleswig-Holstein,,Dithmarschen,Burg-St. Michaelisdonn,Burg (Dithmarschen),10515163016,Kreisangehörige Gemeinde,11.25,4159,2033,2126,370,25712,926246,5399748,übrig. Schleswig-Holstein,gering besiedelt


In [42]:
# Write the processed table to CSV; index=False leaves the row index out of
# the file.
filepath = '../data/processed/gv100ad.csv'
data.to_csv(path_or_buf=filepath, index=False)