# Imports

In [None]:

from pathlib import Path


## Data pre-processing
To make the statistical analyses easier, and since we will be comparing the 19th with the 20th Bundestag afterwards, we make a few adjustments to the data sets:

- Merging of BSW and Die Linke (the Left): the BSW split from Die Linke at the end of 2023, and keeping it as a separate group would make it more difficult to compare the two electoral periods. Merging is justifiable because there is still a strong connection between the two parliamentary groups (see [`ML-Task-1_BSW_Confusion_Experiment`](../ML-Task-1_Classification/ML-Task-1_BSW_Confusion_Experiment.ipynb)).
- Removal of non-attached MPs: this is important because guests often also speak in the Bundestag, but their speeches cannot be assigned to a parliamentary group and would thus distort the results.
- Removal of presidium/neutral moderation: this could also distort the results, because the parliamentary groups that, for example, provide the presiding officer for the day would have a significantly higher speaking share.

In [None]:
from Preprocessing_Pipeline import execute_parallel_preprocessing
from Preprocessing_Pipeline import ensure_required_nlp_resources

# Preprocessing configurations, keyed by a human-readable data set label.
# One entry is added per legislative term in the loop below.
dataset_configs = {}

# Terms to preprocess: the 19th and 20th Bundestag, plus "19_20"
# (presumably the combined data set of both terms -- confirm).
terms = [19, 20, "19_20"]

for term in terms:
    # NOTE(review): the dictionary key says "data_set_12" while log_prefix
    # says "data_set_1" -- confirm which label is intended.
    dataset_configs_curr_term = {
        f"data_set_12, term: {term}": {
            # Fixed: was "../..data/...", which pointed at a non-existent
            # "..data" directory instead of "../../data" like the outputs.
            "input_path": Path(f"../../data/dataFinalStage/speechContentFinalStage/speech_content_{term}.pkl"),
            "output_path_pickle": Path(f"../../data/newData/speech_content_{term}.pkl"),
            "output_path_excel": Path(f"../../data/newData/speech_content_{term}.xlsx"),

            # Filters applied during preprocessing (all active here).
            "position_short": ["Presidium of Parliament"],  # removes presidium/neutral moderation
            # presumably keeps only speeches with a valid faction id -- confirm
            # against Preprocessing_Pipeline
            "only_valid_faction_id": True,
            "without_faction": 18,  # drop non-attached ("fraktionslos") MPs
            # reassign BSW speeches to Die Linke; presumably a pair of
            # faction ids (source, target) -- confirm against the pipeline
            "change_faction": ["3", "7"],
            "log_prefix": f"[data_set_1, Term {term}]",
        }
    }
    dataset_configs.update(dataset_configs_curr_term)



In [None]:
# Run the preprocessing for every configured data set.
# NOTE(review): the __main__ guard is presumably here so that worker processes
# spawned by the parallel pipeline do not re-run this cell's code on import --
# confirm against Preprocessing_Pipeline.
if __name__ == "__main__":
    # Called first so the resources are in place before preprocessing starts
    # (presumably downloads/verifies NLP models or corpora -- confirm).
    ensure_required_nlp_resources()
    # Hands all configurations built above to the pipeline.
    execute_parallel_preprocessing(dataset_configs)