In [18]:
import os
import pandas as pd

# Combined population table, filled level by level. The order of the levels
# matters: every level is de-duplicated against the rows collected before it.
data = pd.DataFrame()

for level in ["gemeinden", "kreise", "bundeslaender"]:
    # Read in the data from a dedicated csv file.
    # The frame is deliberately not called `input`, so the builtin of that
    # name stays usable in the rest of the notebook session.
    level_data = pd.read_csv(
        f"{os.getcwd()}/Quellen/bevoelkerung/2024-07-28_Zensus2022_bevoelkerung-{level}.csv",
        sep=";",
        names=[
            "regkey",
            "name",
            "PERSONEN",
            "ANZAHL",
            "population",
            "QUALITAET",
        ],
        index_col=0,
        usecols=["regkey", "name", "population"],
        engine="python",  # `skipfooter` is not supported by the C parser
        dtype={
            "regkey": "str",  # String because of leading zeros
            "name": "str",
            "population": "int64",
        },
        skiprows=4,
        skipfooter=4,
    )

    level_data["name"] = level_data["name"].str.lstrip()

    # The following two blocks remove duplicate rows with differing names before they "exist" in the same dataframe.
    # They ensure the "best" names (the official, longer ones) remain in the final dataframe.
    # Example: "Freie und Hansestadt Hamburg" vs. "Hamburg"
    # An entity listed on two levels is recognised by its key: the shorter key,
    # right-padded with zeros, equals the key already collected.
    if level == "kreise":
        # Prefer the row of this level: remove already collected rows whose
        # key equals one of this level's keys padded to 12 characters.
        padded_keys = level_data.index.str.ljust(12, "0")
        data = data.loc[~data.index.isin(padded_keys)]

    if level == "bundeslaender":
        # Prefer the already collected row: skip rows of this level whose key,
        # padded to 5 characters, is already present.
        padded_keys = level_data.index.str.ljust(5, "0")
        level_data = level_data.loc[~padded_keys.isin(data.index)]

    # Start from the first level instead of concatenating onto an empty frame;
    # concat with empty entries is deprecated in recent pandas releases and may
    # change the resulting dtypes.
    data = level_data if data.empty else pd.concat([data, level_data])


def rearrange_name(s):
    """Move a trailing, comma-separated designation in front of the name.

    "Würzburg, kreisfreie Stadt" becomes "Kreisfreie Stadt Würzburg".

    Only the first letter of the moved designation is upper-cased. The rest of
    the text keeps its original spelling, because title-casing every word would
    corrupt lower-case particles ("Freie und Hansestadt", "Frankfurt am Main",
    "a.d.").

    Args:
        s: The name as a string.

    Returns:
        ``s`` unchanged if it contains no comma; otherwise the text after the
        rightmost comma, followed by a space and the text before it, both
        stripped of surrounding whitespace. If one side of the comma is empty,
        only the other side is returned (no stray space).
    """
    # Nothing to rearrange without a comma
    if "," not in s:
        return s

    # Split at the rightmost comma only, so commas inside the name survive
    name, designation = (part.strip() for part in s.rsplit(",", 1))

    # Capitalise just the first character; slicing is safe on an empty string
    designation = designation[:1].upper() + designation[1:]

    # Join only the non-empty parts to avoid leading/trailing spaces
    return " ".join(part for part in (designation, name) if part)


# Move a trailing ", <designation>" part of each name in front of the name
data["name"] = data["name"].apply(rearrange_name)

# Reshape all index values to valid 12-digit regkeys by adding trailing zeros where necessary
data.index = data.index.str.ljust(12, "0")

# Remove repeated rows. The regkey has to take part in the comparison: a plain
# `drop_duplicates()` ignores the index and would also delete a *different*
# entity that merely shares name and population with an earlier row.
is_repeated = data.reset_index().duplicated().to_numpy()
data = data.loc[~is_repeated]

data = data.sort_index()

In [19]:
# Persist the processed table as a semicolon-separated csv file in the
# "Quellen/bevoelkerung" folder below the current working directory
output_path = f"{os.getcwd()}/Quellen/bevoelkerung/Zensus2022_Bevoelkerungszahl_regkey.csv"
data.to_csv(output_path, sep=";")

In [20]:
# Load the exported csv file again, with explicit column types
column_types = {
    "regkey": "str",  # String because of leading zeros
    "name": "str",
    "population": "int64",
}
test = pd.read_csv(
    f"{os.getcwd()}/Quellen/bevoelkerung/Zensus2022_Bevoelkerungszahl_regkey.csv",
    index_col=0,
    sep=";",
    dtype=column_types,
)

In [21]:
# Spot check: display the row stored under a single 12-digit regkey
sample_regkey = "096630000000"
test.loc[sample_regkey]

name          Kreisfreie Stadt Würzburg
population                       131316
Name: 096630000000, dtype: object