In [None]:
# Installations
# !pip install matplotlib
# !pip install seaborn
# !pip install scikit-learn
# TODO: joins - concatenate the information that stays unchanged across the years into one dataframe, keep the
# year-specific information in separate dataframes and merge those; check whether a candidate changed their
# party over the years and merge the result back based on the candidates' names
# TODO: add years during df importing
# Feature engineering!
# Use a wide df for clustering and a long df for classification (drop the duplicates?)

## Import knihoven a inicializace potřebných funkcí

In [None]:
# Packages importing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence all warnings for the rest of the notebook so library notices do not
# clutter the cell outputs. One "ignore" filter is enough: an un-entered
# warnings.catch_warnings(...) object has no effect, and a second
# simplefilter("ignore") would only repeat this filter.
# NOTE(review): this also hides useful warnings (e.g. pandas
# SettingWithCopyWarning) - consider narrowing the filter by category.
warnings.filterwarnings("ignore")
# Default size (in inches) of every matplotlib figure created below.
plt.rc("figure", figsize=(10, 6))

In [None]:
import json
import urllib.request

# Download the occupation keyword dictionary used by obor(). Each entry is
# expected to provide a "var" collection of keywords and a "short_name" label.
url = "https://raw.githubusercontent.com/rmnskb/cz-elections/main/dict_occupations.json"
# Open the response as a context manager so the HTTP connection is closed
# even when reading or JSON decoding fails.
with urllib.request.urlopen(url) as response:
    # Fall back to UTF-8 when the server does not declare a charset.
    encoding = response.info().get_content_charset("utf-8")
    occup_dict = json.loads(response.read().decode(encoding))

def obor(val: str) -> str:
    """
    Checks to what field does candidate's profession belongs to (such as finance, law, medicine etc.) with the help of keywords from global occup_dict dictionary
    :param val: string value containing candidate's profession description
    :return: string value with the particular field, returns "Jiné" if no matches were found
    """
    global occup_dict
    for key in occup_dict.keys():
        for i in occup_dict[key]["var"]:
            if i in val:
                return occup_dict[key]["short_name"]
    return "Jiné"

# Display the top-level keys of the downloaded occupation dictionary as a quick sanity check.
occup_dict.keys()

In [None]:
def mandat(val: str) -> "int | None":
    """
    Map the raw mandate flag to 0/1.

    The source files are not formatted consistently: some years use "A"/"N",
    others use 0/1. Callers stringify the raw cell first, so a column that
    pandas parsed as float arrives here as "1.0"/"0.0"; those spellings, as
    well as surrounding whitespace and lower-case letters, are accepted too.

    :param val: raw mandate flag (converted with ``str`` before matching)
    :return: 1 if the candidate obtained a mandate, 0 if not, and None when
        the value is not a recognised flag (e.g. "nan" for a missing cell)
    """
    flag = str(val).strip().upper()
    if flag in {"A", "1", "1.0"}:
        return 1
    if flag in {"N", "0", "0.0"}:
        return 0
    # Unknown / missing flag: stay explicit instead of falling off the end.
    return None

In [None]:
def clean_data(data: pd.DataFrame, strany: dict) -> pd.DataFrame:
    """
    Derive the analysis columns from one raw election table.

    The input frame is left untouched; the returned frame contains exactly
    these columns, in this order:
        -[cele_jmeno]: "JMENO" and "PRIJMENI" joined by a space
        -[vek]: copy of "VEK" (candidate's age)
        -[vs_vzdelani]: 1 if "TITULPRED" or "TITULZA" is filled in, else 0
        -[obor]: field of profession, obtained by passing "POVOLANI" to obor()
        -[pohlavi]: "F" if the full name ends with "á", otherwise "M"
        -[kraj]: region name looked up from the numeric "VOLKRAJ" code (1-14)
        -[NSTRANA]: party code taken over unchanged from the input
        -[ideologie]: "NSTRANA" mapped through ``strany``
        -[procento]: copy of "POCPROC" (relative measure of votes)
        -[mandat]: "MANDAT" normalised by mandat()
        -[year]: taken over unchanged; must already be present in ``data``
    :param data: pandas dataframe including elections data
    :param strany: dictionary mapping party codes to an ideology label
    :return: new pandas dataframe with the columns listed above
    """
    # Region names listed in the order of the VOLKRAJ codes 1..14.
    region_names = (
        "Hlavní město Praha", "Středočeský kraj", "Jihočeský kraj", "Plzeňský kraj", "Karlovarský kraj",
        "Ústecký kraj", "Liberecký kraj", "Královéhradecký kraj", "Pardubický kraj", "Kraj Vysočina",
        "Jihomoravský kraj", "Olomoucký kraj", "Zlínský kraj", "Moravskoslezský kraj",
    )
    region_by_code = {code: name for code, name in enumerate(region_names, start=1)}

    full_name = data["JMENO"] + " " + data["PRIJMENI"]
    has_title = data["TITULPRED"].notnull() | data["TITULZA"].notnull()

    # NSTRANA is used instead of PSTRANA because PSTRANA contains many missing values.
    # TODO: drop the rows with NA (only the year 2006 has 5% NA values, others are relatively good with it)
    cleaned = data.assign(
        cele_jmeno=full_name,
        pohlavi=np.where(full_name.str[-1] == "á", "F", "M"),
        vs_vzdelani=np.where(has_title, 1, 0),
        vek=data["VEK"],
        ideologie=data["NSTRANA"].map(strany),
        kraj=data["VOLKRAJ"].map(region_by_code),
        obor=data["POVOLANI"].apply(lambda profession: obor(str(profession))),
        mandat=data["MANDAT"].apply(lambda flag: mandat(str(flag))),
        procento=data["POCPROC"],
    )

    return cleaned[
        ["cele_jmeno", "vek", "vs_vzdelani", "obor", "pohlavi", "kraj",
         "NSTRANA", "ideologie", "procento", "mandat", "year"]
    ]

## Načtení dat

In [None]:
# Election years for which a raw candidate table is available in the repository.
snem_years = [2006, 2010, 2013, 2017, 2021]

# Read one semicolon-separated CSV per election year and tag every row with
# that year, so the yearly tables can be told apart after concatenation.
snem_list = [
    pd.read_csv(f"https://raw.githubusercontent.com/rmnskb/cz-elections/main/legislative-elections/snem_{year}.csv",
                sep=";", encoding="utf-8").assign(year=year)
    for year in snem_years
]

snem_list[0].head(20)

In [None]:
# Party lookup table; its "VSTRANA" and "Ideologie" columns are paired into a
# plain dict that clean_data() uses to map each candidate's party code.
strany = pd.read_csv(
    "https://raw.githubusercontent.com/rmnskb/cz-elections/main/legislative-elections/strany.csv",
    sep=";",
    encoding="utf-8",
)
strany_dict = {
    party_code: ideology
    for party_code, ideology in zip(strany["VSTRANA"], strany["Ideologie"])
}
strany.head()

In [None]:
# Clean every yearly table. Iterating over snem_list itself (instead of a
# hard-coded range(5)) keeps this cell correct when election years are added
# or removed.
snem_clean_list = [clean_data(raw_df, strany_dict) for raw_df in snem_list]

snem_clean_list[0].head(20)

In [None]:
# Stack the cleaned yearly tables into one long frame (one row per candidate
# and election). Each table already carries its "year" column: it is attached
# when the CSVs are loaded and is one of the columns clean_data() returns, so
# it does not have to be written again here.
snem_long = pd.concat(snem_clean_list, ignore_index=True)
snem_long.tail(10)

## Ošetření chybějících hodnot

In [None]:
# Report how many values are missing per column and how the ideology labels
# are distributed, before any missing-value treatment.
missing_per_column = snem_long.isnull().sum()
ideology_counts = snem_long["ideologie"].value_counts()

print("Počet chybějících hodnot v každém sloupci:\n")
print(missing_per_column)
print("-" * 40)
print("Nejčastější ideologie kandidátů:\n")
print(ideology_counts)

In [None]:
# Treatment of missing values:
#   - rows without "vek" are dropped, since these are non-existent candidates
#   - missing "ideologie" becomes "Pravice", since it's the most frequent value
#   - missing "procento" becomes 0
snem_long = (
    snem_long
    .dropna(subset=["vek"])
    .assign(
        ideologie=lambda frame: frame["ideologie"].fillna("Pravice"),
        procento=lambda frame: frame["procento"].fillna(0),
    )
)

# Print the same report as before the treatment so the effect is visible.
remaining_missing = snem_long.isnull().sum()
ideology_counts_after = snem_long["ideologie"].value_counts()

print("Stav po ošetření chybějících hodnot:")
print("Počet chybějících hodnot v každém sloupci:\n")
print(remaining_missing)
print("-" * 40)
print("Nejčastější ideologie kandidátů:\n")
print(ideology_counts_after)

## Exploratorní analýza

In [None]:
# Summarise the column dtypes and non-null counts of the combined long-format frame.
snem_long.info()

In [None]:
# Age density per election year, split by gender. The hue levels are drawn in
# a fixed order ("F", then "M") so that the relabelled legend below
# ("Ženy", "Muži") always matches the curves, regardless of which gender
# happens to appear first in the data.
g = sns.displot(
    data=snem_long, x="vek", hue="pohlavi", hue_order=["F", "M"],
    kind="kde", fill=True, legend=True, col="year", col_wrap=3,
)
g.set_titles("Rozložení věků kandidátů do Poslanecké sněmovny\n podle pohlaví v roce {col_name}")
g.set(xlabel="Věk", ylabel="Hustota")
sns.move_legend(g, "upper left", bbox_to_anchor=(.70, .47), title="Pohlaví", labels=["Ženy", "Muži"])

# Mean, median and mode of the age for every election year, stored as
# years_avgs[position of the year in snem_years] = [mean, median, mode].
years_avgs = {}
for i, year in enumerate(snem_years):
    ages = snem_long.loc[snem_long["year"] == year, "vek"]
    # mode() may return several values; the first (smallest) one is used.
    years_avgs[i] = [ages.mean(), ages.median(), ages.mode().iat[0]]

# Mark the three statistics in each facet. The facet is looked up by its year
# (g.axes_dict) rather than by position, so a statistic can never be drawn
# into the panel of a different year.
for i, year in enumerate(snem_years):
    ax = g.axes_dict[year]
    mean_age, median_age, mode_age = years_avgs[i]
    ax.axvline(mean_age, ls="--", lw=1.0, label=f"""Průměr: {mean_age:.2f}""")
    ax.axvline(median_age, color="darkred", lw=1.0, label=f"""Medián: {median_age:.2f}""")
    ax.axvline(mode_age, ls="-.", color="orange", lw=1.0, label=f"""Modus: {mode_age:.2f}""")
    ax.legend(loc=0)

In [None]:
# Fixed order and colours of the ideology categories in both plots.
hue_order = ["Levice", "Střed", "Pravice"]
hue_palette = ["Red", "Green", "Blue"]


def _plot_ideologie_podle_kraju(data: pd.DataFrame, title: str) -> None:
    """
    Draw a horizontal count plot of candidates per region, split by ideology.

    :param data: frame with at least the "kraj" and "ideologie" columns
    :param title: title shown above the plot
    """
    # `data` is passed by keyword: the positional form is only accepted by
    # newer seaborn releases.
    sns.countplot(data=data, y="kraj", hue="ideologie",
                  hue_order=hue_order, palette=hue_palette)
    plt.title(title)
    plt.xlabel("Počet")
    plt.ylabel("Kraj")
    plt.legend(title="Ideologie")
    plt.show()


# Select the 2021 table by year instead of by a hard-coded list position.
snem_2021 = snem_clean_list[snem_years.index(2021)]

# First plot: all candidates
_plot_ideologie_podle_kraju(
    snem_2021, "Ideologie kandidátů do sněmovny podle krajů v roce 2021"
)

# Second plot: only the candidates who obtained a mandate
_plot_ideologie_podle_kraju(
    snem_2021[snem_2021["mandat"] == 1],
    "Ideologie zvolených kandidátů do sněmovny podle krajů v roce 2021",
)

In [None]:
# Count of candidates per occupational field, split by whether they hold a
# university degree; fields are ordered from the most to the least frequent.
obor_order = snem_long["obor"].value_counts().index
g = sns.countplot(data=snem_long, y="obor", hue="vs_vzdelani", order=obor_order)
g.set_title("Nejčastější povolání podle vysokoškolského vzdělání v rocích 2006 - 2021")
g.set_xlabel("Počet")
g.set_ylabel("Povolání")
g.legend(title="Vysokoškolské vzdělání", labels=["Ne", "Ano"]);

In [None]:
# Add the occupational field to every raw yearly table (in place, as before)
# and stack the raw tables into one frame. Iterating over snem_list itself
# replaces the hard-coded range(5), which silently depended on the number of
# election years.
for raw_df in snem_list:
    raw_df["obor"] = raw_df["POVOLANI"].apply(lambda row: obor(str(row)))

snem = pd.concat(snem_list, ignore_index=True)

In [None]:
# obsolete - occupations added
# print(snem[snem["obor"] == "Jiné"]["POVOLANI"].value_counts().index[:100].to_list())

## Feature engineering

In [None]:
# Build the modelling table: drop the columns that are not used as features
# and one-hot encode the categorical ones.
ml_dropped_columns = ["cele_jmeno", "NSTRANA", "procento", "year"]
ml_categorical_columns = ["vs_vzdelani", "obor", "pohlavi", "kraj", "ideologie"]

snem_ml = pd.get_dummies(
    data=snem_long.drop(columns=ml_dropped_columns),
    columns=ml_categorical_columns,
)
snem_ml.head()

In [None]:
# Plot how many rows fall into each value of the "mandat" target column.
sns.countplot(
    data=snem_ml, x="mandat"
);
# The data is heavily unbalanced.