In [None]:
# imports
import os
import json
import glob
from pathlib import Path
import xml.etree.ElementTree as et
import regex
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup
from datetime import datetime
import time
import zipfile
import io

# helper functions and constants
from dataGeneration.extract_contributions import extract
from dataGeneration.clean_text import clean_name_headers
from dataGeneration.match_names import insert_politician_id_into_contributions_extended
import paths as PATHS

# **4 Spoken Content**

## **4.1 Extract structured Speeches and Contributions from XML (Term 19 and 20)**

**1. Speech Extraction:** Each session_content.xml file is parsed to extract speeches (rede) together with speaker metadata, in particular politician_id (from MDB data or speaker_faction_lookup) and faction_id (determined from the fraktion tag or, as a fallback, from the lookup table)
- Handling of moderating speeches: The Bundestagspräsident:in, Vizepräsident:innen and Schriftführer:innen moderate sessions in the Bundestag. Their speeches are labeled with "Presidium of Parliament" in position_short. Their contributions are mostly short and repetitive and can easily be removed later
- Handling of speeches of ministers: In the Bundestag ministers officially speak as representatives of the current Bundesregierung and therefore their speeches do not have a "fraktion" tag. We work around that by using the politician-faction-lookup table to assign a faction_id to them

**2. Contribution Extraction:** Embedded commentary tags (kommentar) in the speech content are parsed using a regex-based extraction function:
- Contributions are isolated, tokenized, and classified.
- They are replaced in the speech content with a placeholder.
- Metadata for each contribution is stored separately.
- Here an external extraction function extract(…) is used: it can be found in [extract_contributions.py](../extract_contributions.py).

This parser-step is central to transforming raw XML session data into:
- Clean structured speech datasets suitable for NLP tasks
- Detailed speaker-level contributions with speaker and faction alignment

Internal Highlights:
- Position Handling: Robust mapping of fraktion and rolle tags to standard position categories.
- Speaker Disambiguation: Matching via speaker_id, or by last name with first-name overlap as a tie-breaker when the last name alone is ambiguous.
- Date Handling: Converts sitzung-datum to Unix time.
- Failsafes: Manual correction for broken XML dates (e.g., session 19158).


### **Input:**
```
dataStage02/
├── data19xmlSplit/
│   ├── 19001/
│   │   ├── appendix.xml
│   │   ├── meta_data.xml
│   │   ├── toc.xml
│   │   └── session_content.xml
│   ├── 19002/ …
│   └── …
├── data20xmlSplit/
│   ├── 20001/
│   │   ├── appendix.xml
│   │   ├── meta_data.xml
│   │   ├── toc.xml
│   │   └── session_content.xml
│   ├── 20002/ …
│   └── …
dataStage03/
├── dataPoliticiansStage03/
│   ├── politicians.csv
├── dataFactionsStage03/
│   ├── factionsAbbreviations.pkl
│   └── speaker_faction_lookup.csv
```

### **Output:**
```
dataStage04/
├── contributionsExtended/
│   ├── electoral_term_19/
│   │   ├── 19001.pkl
│   │   └── …
│   ├── electoral_term_20/
│   │   ├── 20001.pkl
│   │   └── …
├── contributionsSimplified/
│   ├── contributions_simplified_19.pkl
│   ├── contributions_simplified_20.pkl
│   └── contributions_simplified_19_20.pkl
├── speechContent/
│   ├── electoral_term_19/
│   │   └── speech_content.pkl
│   ├── electoral_term_20/
│   │   └── speech_content.pkl
dataFinalStage/
├── contributionsSimplified/
│   ├── contributions_simplified_19.pkl
│   ├── contributions_simplified_20.pkl
│   └── contributions_simplified_19_20.pkl
dataExcel/
├── finalStage/
│   ├── contributions_simplified_19.xlsx
│   ├── contributions_simplified_20.xlsx
│   └── contributions_simplified_19_20.xlsx
```


**Columns (speech_content_--)**:
| Column name      | Description                                                  |
|------------------|--------------------------------------------------------------|
| `id`             | Unique ID of the speech                                      |
| `session`        | Session identifier (e.g. "19001")                            |
| `first_name`     | First name(s) of the speaker                                 |
| `last_name`      | Last name of the speaker                                     |
| `faction_id`     | Integer ID of the faction                                    |
| `position_short` | Role class (Chancellor, Guest, Member of Parliament, Minister, Not found, Presidium of Parliament, Secretary of State)              |
| `position_long`  | Full title (e.g. "Parliamentary Secretary of State")         |
| `politician_id`  | Foreign key to matched person (or -1 if unknown)             |
| `speech_content` | Full plain text of the speech and text position of contributions e.g. ({1}) |
| `date`           | Date of the session in Unix timestamp format (seconds since 1970) |

**Columns (contributions_simplified_--)**:
| Column name      | Description                                 |
|------------------|---------------------------------------------|
| `text_position`  | Character index in the speech text          |
| `content`        | Extracted contribution (e.g. "[Applause]")  |
| `speech_id`      | Foreign key to the speech this belongs to   |

In [None]:
# Input directories: one folder of per-session XML splits per electoral term.
# The keys (19, 20) are the term numbers used to filter `politicians` and
# `lookup` and to name the output folders/files further down.
ELECTORAL_TERM_19_20_INPUT = {
    19: PATHS.XML_SPLIT_19,
    20: PATHS.XML_SPLIT_20,
}

# Input files. NOTE: `politicians` holds a *path* here and is rebound to the
# loaded DataFrame later in this file (`politicians = pd.read_csv(politicians)`).
FACTIONS = PATHS.FACTIONS_ABBR_STAGE03
politicians = PATHS.MPS_FACTIONS_STAGE03
LOOKUP = PATHS.SPEAKER_LOOKUP_STAGE03
# Speaker -> faction fallback table; the extraction loop filters it by
# `speaker_id` and `electoral_term` and reads `faction_id` from it.
lookup = pd.read_csv(LOOKUP)

# Output directories
# NOTE(review): `base_output` is not referenced anywhere else in the visible code.
base_output = PATHS.STAGE04
# Parent folders that receive one `electoral_term_<n>` sub-folder per term.
ELECTORAL_TERM_19_20_OUTPUT = PATHS.SPEECH_CONTENT_19.parent
CONTRIBUTIONS_SIMPLIFIED = PATHS.CONTRIB_SIMPLIFIED
CONTRIBUTIONS_EXTENDED = PATHS.CONTRIB_EXT_19.parent

# Create the output folders up front so later writes cannot fail on a missing directory.
ELECTORAL_TERM_19_20_OUTPUT.mkdir(parents=True, exist_ok=True)
CONTRIBUTIONS_SIMPLIFIED.mkdir(parents=True, exist_ok=True)
CONTRIBUTIONS_EXTENDED.mkdir(parents=True, exist_ok=True)

# Maps a canonical faction abbreviation (key) to a regex that recognises the
# spellings of that faction found in the source data.
#
# The dict is used in two ways in this file:
#   * `get_faction_abbrev` scans it in insertion order with `regex.search` and
#     returns the FIRST key whose pattern matches, so the order of entries
#     is part of the behaviour.
#   * `get_position_short_and_long` only checks whether a string is one of the keys.
#
# NOTE(review): because the first match wins, some later entries can never be
# returned by `get_faction_abbrev`:
#   * "DP" (`^DP`) precedes "DPB" (`^DPB`), so a string starting with "DPB"
#     resolves to "DP".
#   * "DRP" also matches `^SRP` and `^DBP` and precedes the "SRP" and "DBP" entries.
# Confirm whether this shadowing is intended before reordering anything.
faction_patterns = {
    "Bündnis 90/Die Grünen": r"(?:BÜNDNIS\s*(?:90)?/?(?:\s*D[1I]E)?|Bündnis\s*90/(?:\s*D[1I]E)?)?\s*[GC]R[UÜ].?\s*[ÑN]EN?(?:/Bündnis 90)?",  # noqa: E501
    "CDU/CSU": r"(?:Gast|-)?(?:\s*C\s*[DSMU]\s*S?[DU]\s*(?:\s*[/,':!.-]?)*\s*(?:\s*C+\s*[DSs]?\s*[UÙ]?\s*)?)(?:-?Hosp\.|-Gast|1)?",  # noqa: E501
    "BP": r"^BP",
    "DA": r"^DA",
    "DP": r"^DP",
    "DIE LINKE.": r"DIE LINKE",
    "DPB": r"^DPB",
    "DRP": r"DRP(\-Hosp\.)?|^SRP|^DBP",
    "FDP": r"\s*F\.?\s*[PDO][.']?[DP]\.?",
    "Fraktionslos": r"(?:fraktionslos|Parteilos)",
    "FU": r"^FU",
    "FVP": r"^FVP",
    "Gast": r"Gast",
    "GB/BHE": r"(?:GB[/-]\s*)?BHE(?:-DG)?",
    "KPD": r"^KPD",
    "NR": r"^NR$",
    "PDS": r"(?:Gruppe\s*der\s*)?PDS(?:/(?:LL|Linke Liste))?",
    "SPD": r"\s*'?S(?:PD|DP)(?:\.|-Gast)?",
    "SSW": r"^SSW",
    "SRP": r"^SRP",
    "WAV": r"^WAV",
    "Z": r"^Z$",
    "AfD": r"^AfD$",
    "DBP": r"^DBP$",
}


def get_position_short_and_long(position_raw):
    """Classify a raw position string into ``(position_short, position_long)``.

    ``position_raw`` is either a key of ``faction_patterns`` (the speaker is a
    regular member of a faction) or a free-text role title. The checks run in
    a fixed order and the first one that applies decides the result:

    1. faction key or "Berichterstatter(in)"  -> "Member of Parliament"
    2. presidium titles                        -> "Presidium of Parliament"
    3. guest-like titles                       -> "Guest"
    4. "Bundeskanzler(in)"                     -> "Chancellor" (long form None)
    5. "(Bundes)Minister(in)"                  -> "Minister"
    6. "(Parl.) Staatssekretär(in)"            -> "Secretary of State"
    7. anything else                           -> ("Not found", None)

    NOTE(review): the trailing group ``(\\s|$|,|.)`` contains an unescaped
    ``.``, so it accepts any following character; the patterns therefore
    behave as plain prefix matches. Kept as-is to preserve behaviour.
    """

    def starts_like(patterns):
        # True as soon as one of the anchored patterns matches position_raw.
        return any(regex.match(pattern, position_raw) for pattern in patterns)

    is_faction_key = position_raw in faction_patterns
    if is_faction_key or regex.match(
        r"^[Bb]erichterstatter(in)?(\s|$|,|.)", position_raw
    ):
        # Plain faction members carry no long title.
        return "Member of Parliament", (None if is_faction_key else position_raw)

    presidium_patterns = (
        r"^[Bb]undestagspräsident(in)?(\s|$|,|.)",
        r"^[Aa]lterspräsident(in)?(\s|$|,|.)",
        r"^[Vv]izebundestagspräsident(in)?(\s|$|,|.)",
        r"^[Ss]chriftführer(in)?(\s|$|,|.)",
    )
    presidium_titles = [
        "präsidentin",
        "präsident",
        "präsident des deutschen bundestages",
        "präsidentin des deutschen bundestages",
        "vizepräsidentin",
        "vizepräsident",
    ]
    if starts_like(presidium_patterns) or position_raw.lower() in presidium_titles:
        return "Presidium of Parliament", position_raw

    guest_patterns = (
        r"^[Bb]undespräsident(in)?(\s|$|,|.)",
        r"^[Mm]inisterpräsident(in)?(\s|$|,|.)",
        r"^[Ss]taatsminister(in)?(\s|$|,|.)",
        r"^[Ss]enator(in)?(\s|$|,|.)",
        r"^[Pp]räsident(in)?(\s|$|,|.)",
        r"^[Gg]ast",
    )
    if starts_like(guest_patterns):
        return "Guest", position_raw

    if regex.match(r"^[Bb]undeskanzler(in)?(\s|$|,|.)", position_raw):
        return "Chancellor", None

    if regex.match(r"^(Bundes)?[Mm]inister(in)?(\s|$|,|.)", position_raw):
        return "Minister", position_raw

    if regex.match(
        r"^([Pp]arl\s*\.\s+)?[Ss]taatssekretär(in)?(\s|$|,|.)", position_raw
    ):
        return "Secretary of State", position_raw

    return "Not found", None


def get_first_last(name):
    """Split a full name into ``(first_name, last_name)``.

    The last whitespace-separated token is the last name; everything before
    it (possibly nothing) is joined with single spaces as the first name.

    Args:
        name: Full name as a string.

    Returns:
        ``(first_name, last_name)``. A single token yields ``("", token)``.
        An empty or whitespace-only input yields ``("ERROR", "ERROR")``.
    """
    parts = name.split()
    if not parts:
        # Return the marker directly: joining the *string* "ERROR" with spaces
        # (as the previous implementation did) produced "E R R O R".
        return "ERROR", "ERROR"
    return " ".join(parts[:-1]), parts[-1]

def find_with_default(node, key, default):
    """Return the text of the first child of *node* matching *key*, or *default*.

    Args:
        node: An ``xml.etree.ElementTree.Element`` to search in.
        key: Tag name / path passed to ``node.find``.
        default: Value returned when no usable text is available.

    Returns:
        The child's text, or *default* if the child is missing or has no text
        (e.g. ``<vorname/>``). Without the second check an empty element would
        leak ``None`` to callers that expect a string.
    """
    result = node.find(key)
    if result is None or result.text is None:
        return default
    return result.text

def get_faction_abbrev(faction, faction_patterns):
    """Return the abbreviation whose pattern first matches *faction*.

    The patterns are tried in the mapping's iteration order using
    ``regex.search`` (match anywhere in the string unless the pattern is
    anchored). The key of the first matching entry is returned; ``None`` is
    returned when nothing matches.
    """
    matching_keys = (
        abbreviation
        for abbreviation, pattern in faction_patterns.items()
        if regex.search(pattern, faction)
    )
    return next(matching_keys, None)


# Running identifier assigned to speech rows; starts at one million.
speech_content_id = 1000000

# Empty frame that spells out the column layout of the speech table.
speech_content = pd.DataFrame(
    {
        column: []
        for column in (
            "id",
            "session",
            "first_name",
            "last_name",
            "faction_id",
            "position_short",
            "position_long",
            "politician_id",
            "speech_content",
            "date",
        )
    }
)

factions = pd.read_pickle(FACTIONS)

# `politicians` is rebound from the CSV path to the loaded table.
politicians = pd.read_csv(politicians)

# Normalise both name columns for matching: lower-case and fold "ß" to "ss".
for _name_column in ("last_name", "first_name"):
    politicians[_name_column] = (
        politicians[_name_column].str.lower().str.replace("ß", "ss", regex=False)
    )

# First names become a list of tokens so individual given names can be compared.
politicians["first_name"] = politicians["first_name"].apply(str.split)

def _speech_record(
    speech_id,
    session,
    first_name,
    last_name,
    faction_id,
    position_short,
    position_long,
    politician_id,
    speech_text,
    date,
):
    """Build one row of the speech_content table as a dict."""
    return {
        "id": speech_id,
        "session": session,
        "first_name": first_name,
        "last_name": last_name,
        "faction_id": faction_id,
        "position_short": position_short,
        "position_long": position_long,
        "politician_id": politician_id,
        "speech_content": speech_text,
        "date": date,
    }


def _parse_speaker_id(speaker):
    """Return the integer ``id`` attribute of a ``redner`` element, or -1.

    ``int(None)`` raises ``TypeError`` (missing attribute), a non-numeric id
    raises ``ValueError`` and a missing element raises ``AttributeError``;
    all three mean "no usable id".
    """
    try:
        return int(speaker.get("id"))
    except (TypeError, ValueError, AttributeError):
        return -1


def _raw_position(name):
    """Return the ``fraktion`` text of a ``name`` element, else the
    ``rolle/rolle_lang`` text, else an empty string. Never returns None."""
    if name is None:
        return ""
    faction_element = name.find("fraktion")
    if faction_element is not None and faction_element.text:
        return faction_element.text
    role_element = name.find("rolle")
    if role_element is None:
        return ""
    # `or ""` guards against an empty <rolle_lang/> element.
    return find_with_default(role_element, "rolle_lang", "") or ""


def _faction_id_for(faction_abbrev):
    """Return the id stored in ``factions`` for *faction_abbrev*, or -1 if falsy."""
    if not faction_abbrev:
        return -1
    # .iloc[0] is important right now, as some faction entries
    # in factions df share same faction_id, so always the first
    # one is chosen right now.
    return int(factions.loc[factions["abbreviation"] == faction_abbrev, "id"].iloc[0])


def _match_politician_by_name(candidates, first_name, last_name):
    """Return the unique ``ui`` in *candidates* matching the given name, or -1.

    The last name must match exactly; if several persons share it, the match
    is narrowed to those sharing at least one first-name token. The query is
    normalised like the ``politicians`` table (lower-case, "ß" -> "ss");
    previously only lower-casing was applied, so names containing "ß" could
    never match.
    """
    normalised_last = last_name.lower().replace("ß", "ss")
    matches = candidates.loc[candidates["last_name"] == normalised_last]
    unique_count = len(np.unique(matches["ui"]))
    if unique_count > 1:
        first_names = {
            token.lower().replace("ß", "ss") for token in first_name.split()
        }
        matches = matches.loc[
            ~matches["first_name"].apply(
                lambda known: set(known).isdisjoint(first_names)
            )
        ]
        unique_count = len(np.unique(matches["ui"]))
    if unique_count == 1:
        return int(matches["ui"].iloc[0])
    return -1


# Parse every session of every electoral term into speech rows and
# contribution tables, and write them to the stage-04 / final-stage outputs.
for term_number, folder_path in ELECTORAL_TERM_19_20_INPUT.items():
    if not folder_path.is_dir():
        continue

    contributions_extended_output = CONTRIBUTIONS_EXTENDED / f"electoral_term_{term_number}"
    term_spoken_content = ELECTORAL_TERM_19_20_OUTPUT / f"electoral_term_{term_number}"

    contributions_extended_output.mkdir(parents=True, exist_ok=True)
    term_spoken_content.mkdir(parents=True, exist_ok=True)

    speech_records = []
    contributions_simplified = []

    politicians_electoral_term = politicians.loc[
        politicians["electoral_term"] == term_number
    ]

    for session_path in tqdm(sorted(folder_path.iterdir()), desc=f"Wahlperiode {term_number}"):
        if not session_path.is_dir():
            continue

        contributions_extended = []

        session_content = et.parse(session_path / "session_content.xml")
        meta_data = et.parse(session_path / "meta_data.xml")

        date = meta_data.getroot().get("sitzung-datum")
        # Wrong date in xml file. Fixing manually
        if session_path.stem == "19158":
            date = "07.05.2020"
        # Seconds since 1970-01-01 (naive datetime arithmetic, no timezone).
        date = (
            datetime.strptime(date, "%d.%m.%Y") - datetime(1970, 1, 1)
        ).total_seconds()

        root = session_content.getroot()

        for top in root.findall("tagesordnungspunkt"):
            for speech in top.findall("rede"):
                if len(speech) == 0:
                    # A <rede> without children has no speaker header.
                    continue
                speaker = speech[0].find("redner")
                if speaker is None:
                    continue

                speaker_id = _parse_speaker_id(speaker)
                name = speaker.find("name")
                if name is None:
                    first_name, last_name = "", ""
                else:
                    first_name = find_with_default(name, "vorname", "")
                    last_name = find_with_default(name, "nachname", "")

                position_raw = _raw_position(name)
                faction_abbrev = get_faction_abbrev(
                    str(position_raw), faction_patterns=faction_patterns
                )
                position_short, position_long = get_position_short_and_long(
                    faction_abbrev
                    if faction_abbrev
                    else regex.sub("\n+", " ", position_raw)
                )
                faction_id = _faction_id_for(faction_abbrev)

                if faction_id == -1:
                    # No faction tag (e.g. ministers): fall back to the
                    # speaker/term lookup table.
                    row = lookup[
                        (lookup["speaker_id"] == speaker_id)
                        & (lookup["electoral_term"] == term_number)
                    ]
                    if not row.empty:
                        faction_id = int(row["faction_id"].iloc[0])

                speech_text = ""
                text_position = 0
                for content in speech[1:]:
                    tag = content.tag
                    if tag == "name":
                        # A plain <name> line hands the floor to someone else
                        # (typically the chair): close the current segment.
                        speech_records.append(
                            _speech_record(
                                speech_content_id,
                                session_path.stem,
                                first_name,
                                last_name,
                                faction_id,
                                position_short,
                                position_long,
                                speaker_id,
                                speech_text,
                                date,
                            )
                        )
                        speech_content_id += 1
                        faction_id = -1
                        # First token is the title, the rest is the name.
                        name_parts = regex.sub(":", "", content.text or "").split()
                        first_name, last_name = get_first_last(" ".join(name_parts[1:]))
                        position_short, position_long = get_position_short_and_long(
                            name_parts[0] if name_parts else ""
                        )
                        speaker_id = _match_politician_by_name(
                            politicians_electoral_term, first_name, last_name
                        )
                        speech_text = ""
                        text_position = 0
                    elif tag == "p" and content.get("klasse") == "redner":
                        # A nested speaker header: close the current segment
                        # and switch to the new speaker.
                        speech_records.append(
                            _speech_record(
                                speech_content_id,
                                session_path.stem,
                                first_name,
                                last_name,
                                faction_id,
                                position_short,
                                position_long,
                                speaker_id,
                                speech_text,
                                date,
                            )
                        )
                        speech_content_id += 1
                        speech_text = ""
                        text_position = 0

                        speaker = content.find("redner")
                        speaker_id = _parse_speaker_id(speaker)
                        possible_matches = politicians_electoral_term.loc[
                            politicians_electoral_term["ui"] == speaker_id
                        ]
                        if len(possible_matches) == 0:
                            speaker_id = -1
                        name = speaker.find("name") if speaker is not None else None
                        try:
                            first_name = name.find("vorname").text
                            last_name = name.find("nachname").text
                        except AttributeError:
                            try:
                                first_name, last_name = get_first_last(speech[0].text)
                            except AttributeError:
                                first_name = "ERROR"
                                last_name = "ERROR"

                        position_raw = _raw_position(name)
                        faction_abbrev = get_faction_abbrev(
                            str(position_raw), faction_patterns=faction_patterns
                        )
                        position_short, position_long = get_position_short_and_long(
                            faction_abbrev
                            if faction_abbrev
                            else regex.sub("\n+", " ", position_raw)
                        )
                        # NOTE(review): unlike the main speaker above, no
                        # lookup-table fallback is applied here, so speakers
                        # without a fraktion tag keep faction_id -1. Confirm
                        # whether that is intended.
                        faction_id = _faction_id_for(faction_abbrev)
                    elif tag == "p":
                        if content.text is not None:
                            speech_text += "\n\n" + content.text
                    elif tag == "kommentar":
                        (
                            contributions_extended_frame,
                            speech_replaced,
                            contributions_simplified_frame,
                            text_position,
                        ) = extract(
                            content.text,
                            int(session_path.stem),
                            speech_content_id,
                            text_position,
                            False,
                        )
                        speech_text += "\n\n" + speech_replaced
                        contributions_extended.append(contributions_extended_frame)
                        contributions_simplified.append(contributions_simplified_frame)

                # Close the last (or only) segment of this <rede>.
                speech_records.append(
                    _speech_record(
                        speech_content_id,
                        session_path.stem,
                        first_name,
                        last_name,
                        faction_id,
                        position_short,
                        position_long,
                        speaker_id,
                        speech_text,
                        date,
                    )
                )
                speech_content_id += 1

        # pd.concat raises ValueError on an empty list, which happens for a
        # session without any <kommentar>; store an empty frame instead.
        contributions_extended = (
            pd.concat(contributions_extended, sort=False)
            if contributions_extended
            else pd.DataFrame()
        )
        contributions_extended.to_pickle(contributions_extended_output / f"{session_path.stem}.pkl")

    speech_content = pd.DataFrame.from_records(speech_records)
    speech_content.to_pickle(term_spoken_content / "speech_content.pkl")

    # Same empty-list guard as for the per-session frames.
    contributions_simplified = (
        pd.concat(contributions_simplified, sort=False)
        if contributions_simplified
        else pd.DataFrame()
    )
    contributions_simplified.to_pickle(CONTRIBUTIONS_SIMPLIFIED / f"contributions_simplified_{term_number}.pkl")
    PATHS.FINAL_CONTRIB_SIM.mkdir(parents=True, exist_ok=True)
    contributions_simplified.to_pickle(PATHS.FINAL_CONTRIB_SIM / f"contributions_simplified_{term_number}.pkl")

    # Save contributions simplified to Excel (make sure the target folder exists).
    PATHS.EXCEL_FINAL_STAGE.mkdir(parents=True, exist_ok=True)
    contributions_simplified.to_excel(PATHS.EXCEL_FINAL_STAGE / f"contributions_simplified_{term_number}.xlsx", index=False)


# Collect the per-term simplified contributions that exist on disk.
# A term whose input folder was missing is skipped by the extraction loop and
# therefore has no pickle; reading it unconditionally raised FileNotFoundError.
simplified_contribs_by_term = {}

for term_number in [19, 20]:
    simplified_path = CONTRIBUTIONS_SIMPLIFIED / f"contributions_simplified_{term_number}.pkl"
    if not simplified_path.exists():
        print(f"No simplified contributions found for term {term_number}; skipping.")
        continue
    df = pd.read_pickle(simplified_path)
    simplified_contribs_by_term[term_number] = df

# Write the combined simplified-contributions file (pickle + Excel).
if simplified_contribs_by_term:
    # Make sure the target folders exist even if this cell runs on its own.
    PATHS.FINAL_CONTRIB_SIM.mkdir(parents=True, exist_ok=True)
    PATHS.EXCEL_FINAL_STAGE.mkdir(parents=True, exist_ok=True)

    combined_df = pd.concat(simplified_contribs_by_term.values(), ignore_index=True)
    combined_df.to_pickle(CONTRIBUTIONS_SIMPLIFIED / "contributions_simplified_19_20.pkl")
    combined_df.to_pickle(PATHS.FINAL_CONTRIB_SIM / "contributions_simplified_19_20.pkl")
    combined_df.to_excel(PATHS.EXCEL_FINAL_STAGE / "contributions_simplified_19_20.xlsx", index=False)


# Save speech content to Excel, one workbook per electoral term.
for speech_dir, excel_target in (
    (PATHS.SPEECH_CONTENT_04_19, PATHS.EXCEL_SPEECH_STAGE04_19),
    (PATHS.SPEECH_CONTENT_04_20, PATHS.EXCEL_SPEECH_STAGE04_20),
):
    speech_pickle = speech_dir / "speech_content.pkl"
    if not speech_pickle.exists():
        print(f"No speech content found at {speech_pickle}; skipping.")
        continue
    Path(excel_target).parent.mkdir(parents=True, exist_ok=True)
    df = pd.read_pickle(speech_pickle)
    df.to_excel(excel_target, index=False)

print("speeches saved.")