In [None]:
# imports
import os
import json
import glob
from pathlib import Path
import xml.etree.ElementTree as et
import regex
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup
from datetime import datetime
import time
import zipfile
import io

# helper functions and constants
from dataGeneration.extract_contributions import extract
from dataGeneration.clean_text import clean_name_headers
from dataGeneration.match_names import insert_politician_id_into_contributions_extended
import paths as PATHS

# **2 Factions:**

## **2.1 Extract & Normalize Factions from MP Metadata**

- Extracts a unique list of factions from the structured MP dataset (`mps.pkl`), which are officially listed as institutional affiliations for MPs in the Bundestag.
- Since some faction names appear in speech data but not in the structured MP data (e.g. historical or special cases), this script manually appends missing entries to the list of known factions to ensure consistency in downstream data processing.



### **Input:**
```
dataStage02/
├── dataPoliticiansStage02/
│   └── mps.pkl
```

### **Output:**
```
dataStage02/
├── dataFactionsStage02/
│   └── factions.pkl
dataExcel/
└── factions_stage02.xlsx
```

**Columns (factions.pkl):**
| Column name    | Description                                     |
|----------------|-------------------------------------------------|
| `faction_name` | Full name of the faction or parliamentary group |

In [None]:
# Input and output paths
input_politiker_pfad = PATHS.POLITICIANS_STAGE02
save_factions_pfad = PATHS.FACTIONS_STAGE02.parent
save_factions_pfad.mkdir(parents=True, exist_ok=True)

# Load the MP metadata and keep the institution names of every row whose
# institution type is "Fraktion/Gruppe" (faction/group).
mps = pd.read_pickle(input_politiker_pfad)
factions = mps.loc[mps["institution_type"] == "Fraktion/Gruppe", "institution_name"]

# Sorted, de-duplicated faction names found in the MP data.
unique_factions = np.unique(factions)

# Names that are added by hand because they are expected to be absent from
# the MP data. Only names that are really missing are appended, so a name
# that later shows up in the MP data does not produce a duplicate row.
additional_factions = [
    "Südschleswigscher Wählerverband",
    "Gast",
    "Gruppe Nationale Rechte",
    "Deutsche Soziale Union",
]
known_factions = set(unique_factions)
missing_factions = [
    name for name in additional_factions if name not in known_factions
]
if missing_factions:
    # Appended after the sorted names, exactly as before.
    unique_factions = np.append(unique_factions, missing_factions)

unique_factions = pd.DataFrame(unique_factions, columns=["faction_name"])
unique_factions.to_pickle(PATHS.FACTIONS_STAGE02)

# Save as Excel
PATHS.DATA_EXCEL_DIR.mkdir(parents=True, exist_ok=True)
unique_factions.to_excel(PATHS.EXCEL_FACTIONS_STAGE02, index=False)

print(f"{len(unique_factions)} Fraktionen gespeichert unter {PATHS.FACTIONS_STAGE02}")

## **2.2 Add abbreviations to factions**

Augments the previously extracted list of faction names by
- assigning standardized abbreviations to each entry
- and then assigning a unique integer id to each abbreviation
- before outputting as a structured table.


### **Input:**
```
dataStage02
├── dataFactionsStage02/
│   └── factions.pkl
```

### **Output:**
```
dataStage03/
├── dataFactionsStage03/
│   └── factionsAbbreviations.pkl
dataExcel/
└── factionsAbbreviations_stage03.xlsx
```


**Columns (factionsAbbreviations.pkl):**
| Column name     | Description                                        |
|-----------------|----------------------------------------------------|
| `id`            | Unique numeric identifier for each faction         |
| `abbreviation`  | Shortened, normalized faction label (e.g., "SPD")  |
| `faction_name`  | Original full name of the faction                  |

In [None]:
# Input file: the stage-02 faction list (the variable name says "STAGE_01",
# but it points at PATHS.FACTIONS_STAGE02).
FACTIONS_STAGE_01 = PATHS.FACTIONS_STAGE02

# output directory
DATA_FINAL = PATHS.FACTIONS_ABBR_STAGE03.parent
DATA_FINAL.mkdir(parents=True, exist_ok=True)

# Load base factions
factions = pd.read_pickle(FACTIONS_STAGE_01)

# Full faction name -> normalized abbreviation. Several full names may share
# one abbreviation; those rows end up with the same id below.
abbreviations_dict = {
    "Alternative für Deutschland": "AfD",
    "Deutsche Soziale Union": "DSU",
    "Fraktion Alternative für Deutschland": "AfD",
    "Fraktion Bayernpartei": "BP",
    "Fraktion Bündnis 90/Die Grünen": "Bündnis 90/Die Grünen",
    "Fraktion DIE LINKE.": "DIE LINKE.",
    "Fraktion Die Linke": "DIE LINKE.",
    "Fraktion DP/DPB (Gast)": "DP/DPB",
    "Fraktion DRP (Gast)": "DRP",
    "Fraktion Demokratische Arbeitsgemeinschaft": "DA",
    "Fraktion Deutsche Partei": "DP",
    "Fraktion Deutsche Partei Bayern": "DPB",
    "Fraktion Deutsche Partei/Deutsche Partei Bayern": "DP/DPB",
    "Fraktion Deutsche Partei/Freie Volkspartei": "DP/FVP",
    "Fraktion Deutsche Reichspartei": "DRP",
    "Fraktion Deutsche Reichspartei/Nationale Rechte": "DRP/NR",
    "Fraktion Deutsche Zentrums-Partei": "Z",
    "Fraktion Deutscher Gemeinschaftsblock der Heimatvertriebenen und Entrechteten": "BHE",
    "Fraktion Die Grünen": "Bündnis 90/Die Grünen",
    "Fraktion Die Grünen/Bündnis 90": "Bündnis 90/Die Grünen",
    "Fraktion BÜNDNIS 90/DIE GRÜNEN": "Bündnis 90/Die Grünen",
    "Fraktion Freie Volkspartei": "FVP",
    "Fraktion Föderalistische Union": "FU",
    "Fraktion Gesamtdeutscher Block / Block der Heimatvertriebenen und Entrechteten": "GB/BHE",
    "Fraktion WAV (Gast)": "WAV",
    "Fraktion Wirtschaftliche Aufbauvereinigung": "WAV",
    "Fraktion der CDU/CSU (Gast)": "CDU/CSU",
    "Fraktion der Christlich Demokratischen Union/Christlich - Sozialen Union": "CDU/CSU",
    "Fraktion der FDP (Gast)": "FDP",
    "Fraktion der Freien Demokratischen Partei": "FDP",
    "Fraktion der Kommunistischen Partei Deutschlands": "KPD",
    "Fraktion der Partei des Demokratischen Sozialismus": "PDS",
    "Fraktion der SPD (Gast)": "SPD",
    "Fraktion der Sozialdemokratischen Partei Deutschlands": "SPD",
    "Fraktionslos": "Fraktionslos",
    "Gruppe Bündnis 90/Die Grünen": "Bündnis 90/Die Grünen",
    "Gruppe BSW - Bündnis Sahra Wagenknecht - Vernunft und Gerechtigkeit": "BSW",
    "Gruppe Deutsche Partei": "DP",
    "Gruppe Die Linke": "DIE LINKE.",
    "Gruppe Kraft/Oberländer": "KO",
    "Gruppe der Partei des Demokratischen Sozialismus": "PDS",
    "Gruppe der Partei des Demokratischen Sozialismus/Linke Liste": "PDS",
    "Südschleswigscher Wählerverband": "SSW",
    "Gast": "Gast",
    "Gruppe Nationale Rechte": "NR",
}

# Fail before writing anything if a faction name has no abbreviation, and
# report every missing name at once instead of only the first one.
unmapped_factions = [
    name
    for name in factions["faction_name"].unique()
    if name not in abbreviations_dict
]
if unmapped_factions:
    raise KeyError(
        f"No abbreviation defined for faction name(s): {unmapped_factions}"
    )

# Assign abbreviation column (placed first; "id" is inserted in front of it below)
factions.insert(0, "abbreviation", factions["faction_name"].map(abbreviations_dict))

# Create unique faction ID: position of the abbreviation in the sorted list
# of distinct abbreviations.
unique_abbreviations = np.unique(factions["abbreviation"])
faction_ids = list(range(len(unique_abbreviations)))
id_by_abbreviation = dict(zip(unique_abbreviations, faction_ids))

# Map instead of a per-abbreviation loop; this also avoids binding a variable
# named "id", which would shadow the builtin for the rest of the notebook.
factions.insert(
    0, "id", factions["abbreviation"].map(id_by_abbreviation).astype("int64")
)

# Save as Pickle (stage-03 location and final-stage location)
factions.to_pickle(PATHS.FACTIONS_ABBR_STAGE03)
PATHS.DATA_FINAL_STAGE_DIR.mkdir(parents=True, exist_ok=True)
factions.to_pickle(PATHS.FINAL_FACTIONS_ABBREVIATIONS)

# Save as Excel
PATHS.DATA_EXCEL_DIR.mkdir(parents=True, exist_ok=True)
factions.to_excel(PATHS.EXCEL_FACTIONS_ABBR_STAGE03, index=False)

print(f"{len(unique_abbreviations)} Fraktionen und Abkürzungen gespeichert unter {PATHS.FACTIONS_ABBR_STAGE03}")