In [None]:
# Package imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rc("figure", figsize=(10,6))
import seaborn as sns

In [None]:
def clean_mun(df_orig, obce) -> pd.DataFrame:
    """
    Clean and prepare a dataframe containing data from local elections.

    The input dataframe is not modified; a new dataframe with the following columns is returned:
        -[cele_jmeno]: first name and surname joined by a single space
        -[VEK]: passed through unchanged
        -[vs_vzdelani]: 1 when a title before or after the name is present, otherwise 0
        -[POVOLANI]: candidate's profession (passed through unchanged)
        -[pohlavi]: candidate's gender, "F" when the surname ends with "á", otherwise "M"
        -[pohlavi_encoded]: encoded gender, always 0 for "M" and 1 for "F"
        -[kraj]: region looked up in ``obce`` by ``KODZASTUP`` (NaN when the code is unknown)
        -[kraj_encoded]: encoded region, 1..N in alphabetical order of the region names
                         found in ``obce``; 0 when the region is missing
        -[OSTRANA]: political party (passed through unchanged)
        -[POCPROCVSE]: percentage of votes at the election (passed through unchanged)

    Both encodings are independent of the row order and of which regions happen to
    occur in ``df_orig``, so dataframes from different years share the same codes.

    :param df_orig: dataframe containing data from local elections
    :param obce: a dictionary mapping ``KODZASTUP`` codes to region names
    :return: new dataframe with cleaned and encoded data
    """
    df = df_orig.copy()
    df["cele_jmeno"] = df["JMENO"] + " " + df["PRIJMENI"]

    # Look at the surname itself so that a missing first name (which turns the
    # combined name into NaN) or trailing whitespace cannot flip the result.
    is_female = df["PRIJMENI"].str.strip().str.endswith("á", na=False)
    df["pohlavi"] = np.where(is_female, "F", "M")

    has_title = df["TITULPRED"].notna() | df["TITULZA"].notna()
    df["vs_vzdelani"] = has_title.astype(int)

    df["kraj"] = df["KODZASTUP"].map(obce)

    # Derive the region codes from the lookup table, not from the data, so the
    # numbering is identical for every dataframe cleaned with the same ``obce``.
    region_names = sorted(pd.Series(obce, dtype="object").dropna().unique())
    kraj_dict = {name: code for code, name in enumerate(region_names, start=1)}
    # Fixed mapping: deriving it from unique() would depend on which gender comes first.
    pohlavi_dict = {"M": 0, "F": 1}

    # Rows without a region map to NaN; encode them explicitly as 0.
    df["kraj_encoded"] = df["kraj"].map(kraj_dict).fillna(0).astype(int)
    df["pohlavi_encoded"] = df["pohlavi"].map(pohlavi_dict)

    columns = ["cele_jmeno", "VEK", "vs_vzdelani", "POVOLANI", "pohlavi", "pohlavi_encoded",
               "kraj", "kraj_encoded", "OSTRANA", "POCPROCVSE"]

    # Return an independent frame so callers can add columns without SettingWithCopyWarning.
    return df[columns].copy()

In [None]:
# Election years for which a zaso_<year>.csv file is downloaded.
zaso_years = [2006, 2010, 2014, 2018, 2022]

# One raw dataframe per election year, in the same order as zaso_years.
zaso_list = [
    pd.read_csv(f"https://raw.githubusercontent.com/rmnskb/cz-elections/main/municipal-elections/zaso_{year}.csv", sep=";", encoding="utf-8")
    for year in zaso_years
]

# Preview the first loaded year.
zaso_list[0].head()

In [None]:
# Lookup table that is joined to the election data through its CHODNOTA1 column.
obce = pd.read_csv(
    "https://raw.githubusercontent.com/rmnskb/cz-elections/main/municipal-elections/obce.csv",
    sep=";",
    encoding="utf-8",
)

# CHODNOTA1 -> TEXT2; when a CHODNOTA1 value repeats, the last row wins.
obce_dict = {code: text for code, text in zip(obce["CHODNOTA1"], obce["TEXT2"])}

obce.head()

In [None]:
# Clean every loaded year with clean_mun. Iterating over zaso_list itself (instead of a
# hard-coded range(5)) keeps this cell correct when years are added to or removed from zaso_years.
zaso_clean_list = [clean_mun(raw_df, obce_dict) for raw_df in zaso_list]

# Preview the first cleaned year.
zaso_clean_list[0].head()

In [None]:
# Tag each cleaned dataframe (in place) with the election year it belongs to;
# zaso_clean_list is in the same order as zaso_years.
for position, election_year in enumerate(zaso_years):
    yearly_frame = zaso_clean_list[position]
    yearly_frame["year"] = election_year

# Stack all years into one long dataframe with a fresh 0..n-1 index.
zaso_clean_all_years = pd.concat(objs=zaso_clean_list, ignore_index=True)
zaso_clean_all_years.tail(n=10)

In [None]:
# Age distribution (KDE) of candidates, one facet per election year, split by gender.
g = sns.displot(
    data=zaso_clean_all_years, x="VEK", hue="pohlavi", kind="kde", fill=True, legend=True, col="year"
)
g.set_titles("Rozložení věků kandidátů do Zastupitelstev obcí\n podle pohlaví v roce {col_name}")
g.set(xlabel="Věk", ylabel="Hustota")

# Mean, median and mode of VEK per election year, keyed by the year's position in zaso_years.
years_avgs = {}
for i, year in enumerate(zaso_years):
    # Filter once per year instead of re-querying the full frame for every statistic.
    ages = zaso_clean_all_years.loc[zaso_clean_all_years["year"] == year, "VEK"]
    # mode() may return several values; only the first one is drawn.
    years_avgs[i] = [ages.mean(), ages.median(), ages.mode().iat[0]]

# Look each facet up by its year rather than by its position in the grid, so the
# reference lines cannot end up on the wrong facet if zaso_years is reordered.
for i, year in enumerate(zaso_years):
    ax = g.axes_dict[year]
    mean_age, median_age, mode_age = years_avgs[i]
    ax.axvline(mean_age, ls="--", lw=1.0, label=f"Průměr: {mean_age:.2f}")
    ax.axvline(median_age, color="darkred", lw=1.0, label=f"Medián: {median_age:.2f}")
    ax.axvline(mode_age, ls="-.", color="orange", lw=1.0, label=f"Modus: {mode_age:.2f}")
    # Per-facet legend for the three reference lines.
    ax.legend(loc=0)

In [None]:
#zaso_clean_all_years["POVOLANI"].value_counts()[:100].index.tolist()