In [None]:
# imports
import os
import json
import glob
from pathlib import Path
import xml.etree.ElementTree as et
import regex
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup
from datetime import datetime
import time
import zipfile
import io

# helper functions and constants
from dataGeneration.extract_contributions import extract
from dataGeneration.clean_text import clean_name_headers
from dataGeneration.match_names import insert_politician_id_into_contributions_extended
import paths as PATHS

# **Contributions**

## **5.1 Extended contributions - Normalize Speaker Names and Assign Party Affiliation**

Processes the contributions_extended files from stage 4 and prepares them for further use by:
- **Name Cleaning and Splitting:**
    - The name_raw field (raw PDF name extractions) is cleaned from unwanted characters.
    - Academic titles and nobility prefixes (e.g., Dr., von, Freiherr) are removed.
    - Names are then split into first_name and last_name.
    - Here the external logic clean_name_headers is used. It can be found in [clean_text.py](../clean_text.py).
- **Faction Matching:**
    - Party names are normalized using regex patterns.
    - The faction field is mapped to a canonical abbreviation.
    - Corresponding faction_id values are assigned from a precompiled faction lookup table.
- **Output Generation:**
    - At this point the fields first_name, last_name, acad_title, faction, and faction_id are properly cleaned and normalized.


### **Input:**
```
dataStage04/
├── contributionsExtended/
│   ├── electoral_term_19/
│   │   ├── 19001.pkl
│   │   └── …
│   ├── electoral_term_20/
│   │   ├── 20001.pkl
│   │   └── …
dataStage03/
├── dataFactionsStage03/
│   └── factionsAbbreviations.pkl
```

### **Output:**
```
dataStage05/
├── contributionsExtendedStage05/
│   ├── electoral_term_19/
│   │   ├── 19001.pkl
│   │   └── …
│   ├── electoral_term_20/
│   │   ├── 20001.pkl
│   │   └── …
```

In [None]:
# Silence pandas' SettingWithCopyWarning (the chained-assignment check) for the
# whole session. This only hides the warning; it does not change how pandas
# executes the assignment.
pd.options.mode.chained_assignment = None

# Input root: parent of PATHS.CONTRIB_EXT_19. Its sub-directories are iterated
# as electoral-term folders further down.
CONTRIBUTIONS_EXTENDED_INPUT = PATHS.CONTRIB_EXT_19.parent
# Output root for the normalized stage-5 pickles.
CONTRIBUTIONS_EXTENDED_OUTPUT = PATHS.STAGE05 / "contributionsExtendedStage05"
# Faction lookup table; read below through its "abbreviation" and "id" columns.
factions = pd.read_pickle(PATHS.FACTIONS_ABBR_STAGE03)

# Create the output root up front (no error if it already exists).
CONTRIBUTIONS_EXTENDED_OUTPUT.mkdir(parents=True, exist_ok=True)

# Maps a canonical faction abbreviation to a regex that recognises its raw
# spellings (several character classes tolerate look-alike characters such as
# "1" for "I" or "C" for "G").
#
# Insertion order is significant: get_faction_abbrev() returns the FIRST key
# whose pattern matches via regex.search, so earlier entries shadow later ones.
# Patterns without a leading "^" match anywhere inside the faction string.
#
# NOTE(review): the "DRP" pattern contains the alternatives ^\[?SRP\]? and
# ^\[?DBP\]?, so any input matched by the later "SRP" / "DBP" entries is already
# mapped to "DRP" and those two entries are never returned — confirm this is
# intended.
# NOTE(review): in the "DIE LINKE." pattern the "." inside \[DIE ?LINKE.\] is
# unescaped and therefore matches any character.
faction_patterns = {
    "Bündnis 90/Die Grünen": r"(?:BÜNDNIS\s*(?:90)?/?(?:\s*D[1I]E)?|Bündnis\s*90/(?:\s*D[1I]E)?)?\s*[GC]R[UÜ].?\s*[ÑN]EN?(?:/Bündnis 90)?",  # noqa: E501
    "CDU/CSU": r"(?:Gast|-)?(?:\s*C\s*[DSMU]\s*S?[DU]\s*(?:\s*[/,':!.-]?)*\s*(?:\s*C+\s*[DSs]?\s*[UÙ]?\s*)?)(?:-?Hosp\.|-Gast|1)?",  # noqa: E501
    "BP": r"^\[?BP\]?",
    "DA": r"^\[?DA\]?",
    "DP": r"^\[?DP\]?",
    "DIE LINKE.": r"DIE ?LINKE|LINKEN|\[DIE ?LINKE.\]",
    "DPB": r"^\[?DPB\]?",
    "DRP": r"\[?DRP(\-Hosp\.)?\]?|^\[?SRP\]?|^\[?DBP\]?",
    "FDP": r"\s*F\.?\s*[PDO][.']?[DP]\.?",
    "Fraktionslos": r"(?:fraktionslos|Parteilos)",
    "FU": r"^\[?FU\]?",
    "FVP": r"^\[?FVP\]?",
    "Gast": r"\[?Gast\]?",
    "GB/BHE": r"\[?(?:GB[/-]\s*)?BHE(?:-DG)?\]?",
    "KPD": r"^\[?KPD\]?",
    "NR": r"^\[?NR\]?$",
    "PDS": r"(?:Gruppe\s*der\s*)?PDS(?:/(?:LL|Linke Liste))?",
    "SPD": r"\s*'?S(?:PD|DP)(?:\.|-Gast)?",
    "SSW": r"^\[?SSW\]?",
    "SRP": r"^\[?SRP\]?",
    "WAV": r"^\[?WAV\]?",
    "Z": r"^\[?Z\]?$",
    "AfD": r"^\[?AfD\]?$",
    "DBP": r"^\[?DBP\]?$",
}


def get_faction_abbrev(faction, faction_patterns):
    """Return the abbreviation of the first pattern that matches ``faction``.

    Args:
        faction: Raw faction string to classify.
        faction_patterns: Mapping of canonical abbreviation -> regex pattern.
            It is scanned in insertion order, so earlier entries win.

    Returns:
        The key of the first pattern found anywhere in ``faction``
        (``regex.search`` semantics), or ``None`` when no pattern matches.
    """
    # Lazy generator: patterns after the first hit are never evaluated.
    matching_abbrevs = (
        abbrev
        for abbrev, pattern in faction_patterns.items()
        if regex.search(pattern, faction)
    )
    return next(matching_abbrevs, None)


# Name tokens that are recorded in ``acad_title`` and removed from the name
# before it is split into first and last name. A frozenset gives O(1) membership
# tests and is built once instead of once per file.
# "Graf" is deliberately absent because it is also a last name.
# TODO: further titles (e.g. "e.c.") or OCR variants (e.g. "b.c.") may be missing.
TITLE_TOKENS = frozenset(
    {
        "Dr",
        "Frau",
        "D",
        "-Ing",
        "von",
        "und",
        "zu",
        "van",
        "de",
        "Baron",
        "Freiherr",
        "Prinz",
        "h",
        "c",
    }
)

# abbreviation -> id, built once instead of filtering ``factions`` for every
# row. ``keep="first"`` reproduces the previous ``.iloc[0]`` choice should an
# abbreviation occur more than once.
FACTION_ID_BY_ABBREV = (
    factions.drop_duplicates(subset="abbreviation", keep="first")
    .set_index("abbreviation")["id"]
    .to_dict()
)

# iterate over all electoral_term_folders
for folder_path in sorted(CONTRIBUTIONS_EXTENDED_INPUT.iterdir()):
    if not folder_path.is_dir():
        continue

    save_path = CONTRIBUTIONS_EXTENDED_OUTPUT / folder_path.stem
    save_path.mkdir(parents=True, exist_ok=True)

    # iterate over every contributions_extended file (sorted for a
    # deterministic order and so that tqdm knows the total)
    for contrib_file in tqdm(
        sorted(folder_path.glob("*.pkl")), desc=f"Bearbeite {folder_path.stem}"
    ):
        # read the contributions_extended pickle file
        contributions_extended = pd.read_pickle(contrib_file)

        # Add the columns that this stage fills in, with placeholder values:
        # faction_id = -1 means "no faction id assigned".
        # NOTE(review): the insert positions assume the stage-4 column layout,
        # which is not visible from this cell.
        contributions_extended.insert(3, "faction_id", -1)
        contributions_extended.insert(5, "last_name", "")
        contributions_extended.insert(6, "first_name", "")
        contributions_extended.insert(7, "acad_title", "")

        # Current workaround, because some speeches seem to not be matched
        # correctly. If the second stage works without mistakes, this should
        # not be necessary anymore.
        contributions_extended = contributions_extended.fillna("")

        # Clean all the names still remaining from the PDF header.
        # KEEP IN MIND THIS ALSO DELETES NAMES IN VOTING LISTS!!!
        # Possibly not all names are cleaned because of their position, e.g.
        # "Max Mustermann, Bundeskanzler".
        # THIS PART IS IMPORTANT AND SHOULD WORK PROPERLY, AS REOCCURRING NAMES
        # CAN INTRODUCE A LARGE BIAS IN TEXT ANALYSIS.
        names = contributions_extended["name_raw"].to_list()
        contributions_extended["content"] = contributions_extended["content"].apply(
            clean_name_headers,
            args=(np.unique(names), True),
        )

        contributions_extended.reset_index(inplace=True, drop=True)

        # Replace every character that is not an ASCII letter, a German
        # umlaut/ß or "-" (frequent in names) with a space.
        # NOTE(review): letters with other diacritics (e.g. "é", "ğ") are also
        # replaced, which splits such names into separate tokens.
        name_raw = contributions_extended["name_raw"].astype(str)
        name_raw = name_raw.str.replace(r"[^a-zA-ZÖÄÜäöüß\-]", " ", regex=True)

        # Collapse runs of two or more spaces into a single space.
        name_raw = name_raw.str.replace(r"  +", " ", regex=True)

        # Split every raw name into whitespace-separated tokens.
        name_tokens = [raw.split() for raw in name_raw]

        # Tokens that are titles go to acad_title, the rest form the name.
        title_tokens = [
            [token for token in tokens if token in TITLE_TOKENS] for tokens in name_tokens
        ]
        plain_tokens = [
            [token for token in tokens if token not in TITLE_TOKENS] for tokens in name_tokens
        ]

        # Last token is the last name, everything before it the list of first
        # names; no tokens at all -> ([], ""). The columns are assigned as a
        # whole: the former per-row ``df[col].iloc[i] = ...`` was chained
        # assignment, which does not write back under pandas Copy-on-Write.
        # dtype=object keeps each cell a Python list / str.
        row_index = contributions_extended.index
        contributions_extended["acad_title"] = pd.Series(
            title_tokens, index=row_index, dtype=object
        )
        contributions_extended["first_name"] = pd.Series(
            [tokens[:-1] for tokens in plain_tokens], index=row_index, dtype=object
        )
        contributions_extended["last_name"] = pd.Series(
            [tokens[-1] if tokens else "" for tokens in plain_tokens],
            index=row_index,
            dtype=object,
        )

        # Look for parties in the faction column and replace them with the
        # standardized faction name plus its id. Rows whose faction is empty
        # or unrecognised are left untouched (faction_id stays -1).
        for index, faction in zip(
            contributions_extended.index, contributions_extended["faction"]
        ):
            if not faction:
                continue

            faction_abbrev = get_faction_abbrev(
                str(faction), faction_patterns=faction_patterns
            )
            if not faction_abbrev:
                continue

            contributions_extended.at[index, "faction"] = faction_abbrev
            # Abbreviations missing from the lookup table map to -1.
            contributions_extended.at[index, "faction_id"] = int(
                FACTION_ID_BY_ABBREV.get(faction_abbrev, -1)
            )

        # name_raw has been fully decomposed into the name columns above.
        contributions_extended.drop(columns=["name_raw"], inplace=True)
        contributions_extended.to_pickle(save_path / contrib_file.name)

# Report the output root: ``save_path`` would only name the last term folder
# and is undefined when the input contains no term folder at all.
print(f"contributions extended saved to {CONTRIBUTIONS_EXTENDED_OUTPUT}")

## **5.2 Assign Speaker Identity to extended Contributions**

Assigns a unique politician ID (ui) to each contribution entry by matching cleaned name components and metadata to the politician registry:
- Loads the cleaned contributions from dataStage05.
- Loads the complete list of politicians (including government and parliament members) from stage 3.
- Cleans and normalizes name columns in the politician database to facilitate accurate matching.
- Filters politicians per electoral term for performance and accuracy.
- For each file:
    - Matches speaker identity via first_name and last_name against the politician list.
    - Assigns the corresponding ui (unique ID) to the contribution.
    - This matching logic is encapsulated in insert_politician_id_into_contributions_extended(), which can be found in [match_names.py](../match_names.py).



### **Input:**
```
dataStage03/
├── dataPoliticiansStage03/
│   └── politicians.csv
dataStage05/
├── contributionsExtendedStage05/
│   ├── electoral_term_19/
│   │   ├── 19001.pkl
│   │   └── …
│   ├── electoral_term_20/
│   │   ├── 20001.pkl
│   │   └── …
```

### **Output:**
```
dataStage06/
├── contributionsExtendedStage06/
│   ├── electoral_term_19/
│   │   ├── 19001.pkl
│   │   └── …
│   ├── electoral_term_20/
│   │   ├── 20001.pkl
│   │   └── …
```

In [None]:
# input directory
CONTRIBUTIONS_EXTENDED_INPUT = PATHS.STAGE05 / "contributionsExtendedStage05"
DATA_FINAL = PATHS.SPEAKER_LOOKUP_STAGE03.parent  # dataStage03/dataPoliticiansStage03

# output directory
CONTRIBUTIONS_EXTENDED_OUTPUT = PATHS.STAGE06 / "contributionsExtendedStage06"
CONTRIBUTIONS_EXTENDED_OUTPUT.mkdir(parents=True, exist_ok=True)

# MDBS: politician registry, read from the CSV sibling of the speaker lookup
# path and reduced to the columns listed below.
politicians = pd.read_csv(PATHS.SPEAKER_LOOKUP_STAGE03.with_suffix(".csv"))
politicians = politicians.loc[
    :,
    [
        "ui",
        "electoral_term",
        "faction_id",
        "first_name",
        "last_name",
        "gender",
        "constituency",
        "institution_type",
    ],
].copy()

politicians = politicians.astype(dtype={"ui": "int64"})

# Some cleaning to make matching easier: lower-case the text columns and
# write "ß" as "ss" in the name columns.
politicians["constituency"] = politicians["constituency"].fillna("").str.lower()

for name_column in ("first_name", "last_name"):
    politicians[name_column] = (
        politicians[name_column].str.lower().str.replace("ß", "ss", regex=False)
    )

# first_name becomes a list of whitespace-separated name parts.
politicians["first_name"] = politicians["first_name"].apply(str.split)

# iterate over all electoral_term_folders __________________________________________________
for folder_path in sorted(CONTRIBUTIONS_EXTENDED_INPUT.iterdir()):
    if not folder_path.is_dir():
        continue

    # The two digits after "electoral_term_" are the term number; folders
    # that do not follow this naming scheme are skipped.
    term_number = regex.search(r"(?<=electoral_term_)\d{2}", folder_path.stem)
    if term_number is None:
        continue
    term_number = int(term_number.group(0))

    save_path = CONTRIBUTIONS_EXTENDED_OUTPUT / folder_path.stem
    save_path.mkdir(parents=True, exist_ok=True)

    # Only select politicians of the election period.
    politicians_electoral_term = politicians.loc[politicians["electoral_term"] == term_number]
    gov_members_electoral_term = politicians_electoral_term.loc[
        politicians_electoral_term["institution_type"] == "Regierungsmitglied"
    ]

    # iterate over every contributions_extended file
    for contrib_ext_file_path in tqdm(
        sorted(folder_path.glob("*.pkl")),
        desc=f"Match contributions (term {term_number:>2})...",
    ):
        # read the contributions_extended pickle file
        contributions_extended = pd.read_pickle(contrib_ext_file_path)

        # ``problems`` is returned by the helper but not persisted here.
        (
            contributions_extended_matched,
            problems,
        ) = insert_politician_id_into_contributions_extended(
            contributions_extended,
            politicians_electoral_term,
            gov_members_electoral_term,
        )

        # Persist the frame RETURNED by the matcher. Saving the input frame
        # (as before) drops the assigned ids unless the helper happens to
        # mutate its argument in place.
        contributions_extended_matched.to_pickle(save_path / contrib_ext_file_path.name)

# Report the output root: ``save_path`` would only name the last term folder
# and is undefined when no term folder was processed.
print(f"contributions extended saved to {CONTRIBUTIONS_EXTENDED_OUTPUT}")