<a href="https://colab.research.google.com/github/saruman18/GermaParlTEI/blob/main/Data_Preparation_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reading in the Data

In [None]:
import glob
import pandas as pd
import xml.etree.ElementTree as ET
import os

In [None]:
class XMLParser:
    """Collect paragraph-level speech records from XML protocol files.

    Every ``<p>`` that is a direct child of an ``<sp>`` which is itself a
    direct child of a ``<div type="agenda_item">`` becomes one record.
    Records accumulate in ``self.data``; :meth:`parse_xml_files` turns them
    into the DataFrame stored in ``self.df``.
    """

    # Column order of the resulting frame (also used when nothing was parsed,
    # so downstream code still finds the expected columns).
    COLUMNS = [
        "Number", "What", "Description", "Speaker", "SpeakerNumber",
        "SpeechText", "Party", "Role", "ParliamentaryGroup", "SessionNo",
        "LegislativePeriod", "Interjections", "Date",
    ]

    def __init__(self, directory_paths):
        """Remember which directories to scan; no file is read yet.

        Args:
            directory_paths: Iterable of directory paths containing ``*.xml``
                files. A single path (``str`` or ``os.PathLike``) is accepted
                too and treated as a one-element list.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(directory_paths, (str, os.PathLike)):
            directory_paths = [directory_paths]
        self.directory_paths = directory_paths
        self.data = []         # one dict per parsed <p> element
        self.speaker_map = {}  # speaker name -> 1-based number in first-seen order
        self.df = None         # set by parse_xml_files()

    @staticmethod
    def _find_text(root, path):
        """Return the text of the first element matching ``path``, or None."""
        node = root.find(path)
        return node.text if node is not None else None

    @staticmethod
    def _session_sort_key(file_path):
        """Return the integer between the 2nd underscore and the first dot of the file name.

        Only the base name is inspected, so underscores in directory names
        cannot shift the index.

        Raises:
            IndexError, ValueError: If the file name does not have the form
                ``<a>_<b>_<int>.<ext>``.
        """
        file_name = os.path.basename(file_path)
        return int(file_name.split("_")[2].split(".")[0])

    def extract_data_from_file(self, file_path):
        """Parse one XML file and append a record per paragraph to ``self.data``.

        Args:
            file_path: Path of the XML file to parse.

        Raises:
            xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        """
        root = ET.parse(file_path).getroot()

        # File-level metadata; each value is None when the element is absent.
        legislative_period = self._find_text(
            root, ".//teiHeader/fileDesc/titleStmt/legislativePeriod")
        session_no = self._find_text(
            root, ".//teiHeader/fileDesc/titleStmt/sessionNo")
        date = self._find_text(root, ".//publicationStmt/date")

        for div in root.iter('div'):
            if div.attrib.get('type') != 'agenda_item':
                continue

            number = div.attrib.get('n')
            what = div.attrib.get('what')
            description = div.attrib.get('desc')

            # Only <sp> elements that are direct children of the agenda item.
            for sp in div.findall('sp'):
                speaker_name = sp.attrib.get('name')

                # Assign a stable running number the first time a name is seen.
                if speaker_name not in self.speaker_map:
                    self.speaker_map[speaker_name] = len(self.speaker_map) + 1
                speaker_number = self.speaker_map[speaker_name]

                for p in sp.findall('p'):
                    self.data.append({
                        "Number": number,
                        "What": what,
                        "Description": description,
                        "Speaker": speaker_name,
                        "SpeakerNumber": speaker_number,
                        # NOTE(review): p.text is only the text *before* the
                        # first child element of <p>; text following an inline
                        # child is not included - confirm this is intended.
                        "SpeechText": p.text,
                        "Party": sp.attrib.get('party'),
                        "Role": sp.attrib.get('role'),
                        "ParliamentaryGroup": sp.attrib.get('parliamentary_group'),
                        "SessionNo": session_no,
                        "LegislativePeriod": legislative_period,
                        # NOTE(review): counts <stage type="interjection">
                        # nested inside this <p> only; if the corpus places
                        # them next to <p> under <sp>, this is always 0 - verify.
                        "Interjections": len(
                            p.findall(".//stage[@type='interjection']")),
                        "Date": date,
                    })

    def process_files(self):
        """Parse every ``*.xml`` file of every configured directory.

        Within a directory, files are processed in ascending order of the
        integer embedded in their file name (see ``_session_sort_key``).
        """
        for directory_path in self.directory_paths:
            file_list = sorted(
                glob.glob(os.path.join(directory_path, "*.xml")),
                key=self._session_sort_key,
            )
            for file_path in file_list:
                self.extract_data_from_file(file_path)

    def parse_xml_files(self):
        """Parse all configured directories from scratch and build ``self.df``.

        Previously collected records and speaker numbers are discarded first,
        so calling this method again does not duplicate rows.
        """
        self.data = []
        self.speaker_map = {}
        self.process_files()
        self.df = pd.DataFrame(self.data, columns=self.COLUMNS)

In [None]:
# Directory holding the XML files to parse.
# Set the GERMAPARL_DIR environment variable to point somewhere else
# (e.g. on Colab) without editing this cell; the fallback is the original
# local path.
directory_paths = [
    os.environ.get(
        "GERMAPARL_DIR",
        "C:/Users/Engineering CSR20/Desktop/Deep/GermaParlTEI-main/19",
    )
]

xml_parser = XMLParser(directory_paths)
xml_parser.parse_xml_files()
df = xml_parser.df

# Fail here with a clear message rather than with an obscure error in a
# later cell when the directory is wrong or contains no XML files.
if df.empty:
    raise FileNotFoundError(
        f"No speeches were parsed from {directory_paths}; check the directory path."
    )

In [None]:
# Summarise column dtypes and non-null counts of the parsed frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 570716 entries, 0 to 570715
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Number              570716 non-null  object
 1   What                570716 non-null  object
 2   Description         570716 non-null  object
 3   Speaker             570716 non-null  object
 4   SpeakerNumber       570716 non-null  int64 
 5   SpeechText          570716 non-null  object
 6   Party               570716 non-null  object
 7   Role                570716 non-null  object
 8   ParliamentaryGroup  570716 non-null  object
 9   SessionNo           570716 non-null  object
 10  LegislativePeriod   570716 non-null  object
 11  Interjections       570716 non-null  int64 
 12  Date                570716 non-null  object
dtypes: int64(2), object(11)
memory usage: 56.6+ MB


In [None]:
# Speaker roles by frequency. Plan: keep only 'mp' and 'government'
# (definitely remove 'presidency').
df['Role'].value_counts()

Role
mp                            281895
presidency                    260058
government                     27861
misc                             732
parliamentary_commissioner       170
Name: count, dtype: int64

In [None]:
# Restrict the frame to rows whose Role is 'mp' or 'government'
df = df.loc[lambda frame: frame['Role'].isin(['mp', 'government'])]

In [None]:
# Party labels by frequency. Plan: keep only parties that are in the
# parliament and change 'LINKE' to 'DIE LINKE'.
df['Party'].value_counts()

Party
CDU           70853
SPD           64763
AfD           45268
FDP           34280
GRUENE        33504
DIE LINKE     32745
CSU           24873
parteilos      2340
LKR             843
Die PARTEI      266
NA               21
Name: count, dtype: int64

In [None]:
# Unify the party label 'LINKE' to 'DIE LINKE'.
# assign() builds a new frame instead of writing into the filtered slice,
# which is what triggered pandas' SettingWithCopyWarning here.
# TODO: look for other such misclassifications in other legislative periods.
df = df.assign(Party=df['Party'].replace('LINKE', 'DIE LINKE'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Party'] = df['Party'].replace('LINKE', 'DIE LINKE') # we need to look for other such misclassification in other legislative periods


In [None]:
# Party labels to keep; this list needs to change depending on the
# legislative period.
party_names = ['CDU', 'SPD', 'CSU', 'DIE LINKE', 'GRUENE', 'FDP', 'AfD']
df = df.loc[lambda frame: frame['Party'].isin(party_names)]

In [None]:
# Agenda-item types ('What') by frequency
df['What'].value_counts()

What
motion                           130206
consultation                      68837
current_affairs                   29916
Unknown                           16717
section                           11114
report                            10513
questioning_of_the_government      9926
question_time                      8989
debate                             8019
government_declaration             6849
briefing                           2583
NA                                 1732
rules_of_procedure                  466
budget                              271
resolution                          102
election                             25
oath                                 19
objection                             2
Name: count, dtype: int64

In [None]:
# Agenda-item types to drop from the frame
whatnot = ['oath', 'election', 'NA', 'Unknown', 'rules_of_procedure','point_of_order', 'announcement','objection']
df = df.loc[lambda frame: ~frame['What'].isin(whatnot)]

In [None]:
# Parse the Date strings into datetimes, then derive year and month columns
df['Date'] = pd.to_datetime(df['Date'])
df['year'], df['month'] = df['Date'].dt.year, df['Date'].dt.month
df.head(5)

Unnamed: 0,Number,What,Description,Speaker,SpeakerNumber,SpeechText,Party,Role,ParliamentaryGroup,SessionNo,LegislativePeriod,Interjections,Date,year,month
72,2,resolution,Beschlussfassung über die,Carsten Schneider,2,Sehr geehrter Herr Präsident! Sehr geehrte Kol...,SPD,mp,SPD,1,19,0,2017-10-24,2017,10
73,2,resolution,Beschlussfassung über die,Carsten Schneider,2,In diesem Haus debattieren wir über die besten...,SPD,mp,SPD,1,19,0,2017-10-24,2017,10
74,2,resolution,Beschlussfassung über die,Carsten Schneider,2,Der Bundestag muss wieder zur zentralen Bühne ...,SPD,mp,SPD,1,19,0,2017-10-24,2017,10
75,2,resolution,Beschlussfassung über die,Carsten Schneider,2,"Ihr Politikstil, Frau Merkel, ist ein Grund da...",SPD,mp,SPD,1,19,0,2017-10-24,2017,10
76,2,resolution,Beschlussfassung über die,Carsten Schneider,2,Sie haben in diesem Wahlkampf jeden politische...,SPD,mp,SPD,1,19,0,2017-10-24,2017,10


# Text Preprocessing

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# word_tokenize needs the Punkt tokenizer data in addition to the stop-word
# lists. The saved output of this cell shows 'punkt' being fetched by an
# earlier version, but the call was missing, so a fresh environment would
# fail with a LookupError. Recent NLTK releases look for 'punkt_tab'
# instead of 'punkt', so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Engineering
[nltk_data]     CSR20\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Engineering
[nltk_data]     CSR20\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# NLTK's German stop-word list (entries are lower-case)
stop_words = set(stopwords.words('german'))

def preprocess_text(text):
    """Tokenize one speech paragraph and drop party names, stop words and numbers.

    Hyphens and slashes are replaced by spaces before tokenizing. Tokens are
    removed when they equal an entry of the notebook-level ``party_names``
    list, when their lower-cased form is a German stop word, or when they
    consist of digits only. Punctuation tokens are kept.

    Args:
        text: The paragraph text, or None.

    Returns:
        The remaining tokens joined by single spaces, or None if ``text`` is
        None.
    """
    if text is None:
        return None

    text = text.replace("-", " ").replace("/", " ")
    # Use the German Punkt model; the default would be the English one.
    tokens = word_tokenize(text, language='german')

    kept = [
        token for token in tokens
        if token not in party_names
        # Compare case-insensitively: the stop-word list is lower-case, so
        # sentence-initial forms such as "Der" or "Wir" slipped through before.
        and token.lower() not in stop_words
        and not token.isdigit()
    ]
    return ' '.join(kept)

def preprocess_df(df):
    """Add a 'tokens' column with the preprocessed 'SpeechText' of each row.

    The frame is modified in place and also returned.
    """
    cleaned_speeches = df['SpeechText'].apply(preprocess_text)
    df['tokens'] = cleaned_speeches
    return df

In [None]:
# Rebind explicitly instead of relying on the in-place side effect, and
# preview a few rows rather than rendering the whole frame.
df = preprocess_df(df)
df.head()

Unnamed: 0,Number,What,Description,Speaker,SpeakerNumber,SpeechText,Party,Role,ParliamentaryGroup,SessionNo,LegislativePeriod,Interjections,Date,year,month,tokens
72,2,resolution,Beschlussfassung über die,Carsten Schneider,2,Sehr geehrter Herr Präsident! Sehr geehrte Kol...,SPD,mp,SPD,1,19,0,2017-10-24,2017,10,Sehr geehrter Herr Präsident ! Sehr geehrte Ko...
73,2,resolution,Beschlussfassung über die,Carsten Schneider,2,In diesem Haus debattieren wir über die besten...,SPD,mp,SPD,1,19,0,2017-10-24,2017,10,In Haus debattieren besten politischen Lösunge...
74,2,resolution,Beschlussfassung über die,Carsten Schneider,2,Der Bundestag muss wieder zur zentralen Bühne ...,SPD,mp,SPD,1,19,0,2017-10-24,2017,10,Der Bundestag zentralen Bühne politischen Ause...
75,2,resolution,Beschlussfassung über die,Carsten Schneider,2,"Ihr Politikstil, Frau Merkel, ist ein Grund da...",SPD,mp,SPD,1,19,0,2017-10-24,2017,10,"Ihr Politikstil , Frau Merkel , Grund dafür , ..."
76,2,resolution,Beschlussfassung über die,Carsten Schneider,2,Sie haben in diesem Wahlkampf jeden politische...,SPD,mp,SPD,1,19,0,2017-10-24,2017,10,Sie Wahlkampf politischen Streit besseren Idee...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569951,Z1,consultation,Beratung der Beschlussempfehlung des Ausschuss...,Florian Toncar,224,Dennoch besteht nicht allzu viel Grund zum Sel...,FDP,mp,FDP,239,19,0,2021-09-07,2021,9,Dennoch besteht allzu Grund Selbstlob Feiern ;...
569952,Z1,consultation,Beratung der Beschlussempfehlung des Ausschuss...,Florian Toncar,224,"Der andere Grund, warum ich vor zu viel Euphor...",FDP,mp,FDP,239,19,0,2021-09-07,2021,9,"Der Grund , warum Euphorie warne : Der gestrig..."
569953,Z1,consultation,Beratung der Beschlussempfehlung des Ausschuss...,Florian Toncar,224,"Wenn man sich anschaut, wie dieses Gesetz zust...",FDP,mp,FDP,239,19,0,2021-09-07,2021,9,"Wenn anschaut , Gesetz zustande gekommen – For..."
569954,Z1,consultation,Beratung der Beschlussempfehlung des Ausschuss...,Florian Toncar,224,"Wir brauchen eine andere Mentalität, einen and...",FDP,mp,FDP,239,19,0,2021-09-07,2021,9,"Wir brauchen Mentalität , Anspruch Ambition Re..."


In [None]:
# Persist the preprocessed frame as a pickle file (relative path 'BT19.pkl')
df.to_pickle('BT19.pkl')