In [1]:
import os
import json
from datetime import datetime
from lxml import etree

In [2]:
def loadSpeeches(directory: str):
    """Parse every file found directly inside ``directory`` as XML.

    Args:
        directory: Path of the folder that contains the speech XML files.

    Returns:
        A list with the result of ``etree.parse`` for each file, ordered by
        file path.
    """
    parser = etree.XMLParser(dtd_validation=False)
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and anything indexing into it, e.g. element [0]) reproducible.
    candidates = sorted(os.path.join(directory, entry) for entry in os.listdir(directory))
    # Skip sub-directories (e.g. notebook checkpoint folders), which cannot be parsed.
    filenames = [path for path in candidates if os.path.isfile(path)]
    return [etree.parse(file, parser) for file in filenames]


In [3]:
# Number of the legislative period whose data folder is loaded.
legislative_period = 20

# Folder holding the speech XML files of that period.
speeches_path = f'../../data/data_{legislative_period}/speeches'

# One parsed XML tree per file in the folder.
speeches_xml = loadSpeeches(speeches_path)

# Quick sanity check: number of parsed documents and the 'sitzung-datum'
# attribute of the first document's root element.
print(len(speeches_xml))
print(speeches_xml[0].getroot().get('sitzung-datum'))

42
26.10.2021


In [15]:
def parseNames(names):
    """Extract the name fields from the last name record of ``names``.

    Args:
        names: XML element whose children are name records. The fields of a
            record are read by position (children 0, 1, 5 and 6).

    Returns:
        A dict with the keys 'surname', 'forename', 'salutation_title' and
        'academic_title'. A value is None when the element has no text.

    Raises:
        IndexError: If ``names`` has no children or the last record has
            fewer than seven children.
    """
    # Use the last added name from names. The later relation between mdb and
    # speech is done via the id and not the name!
    # list(element) replaces the deprecated Element.getchildren().
    name = list(names[-1])

    return {
        'surname': name[0].text,
        'forename': name[1].text,
        'salutation_title': name[5].text,
        'academic_title': name[6].text,
    }

def parseBiography(biography):
    """Extract the biographical fields from a biography element.

    Args:
        biography: XML element whose children hold the biographical data.
            Fields are read by position (children 0, 1, 2, 4, 5, 6, 7, 8).

    Returns:
        A dict with the keys 'birth_date', 'birth_place', 'birth_country',
        'gender', 'civil_status', 'religion', 'profession' and 'party'.
        A value is None when the element has no text.

    Raises:
        IndexError: If ``biography`` has fewer than nine children.
    """
    # list(element) replaces the deprecated Element.getchildren().
    bio = list(biography)

    return {
        'birth_date': bio[0].text,
        'birth_place': bio[1].text,
        'birth_country': bio[2].text,
        # Child 3 is intentionally not read.
        'gender': bio[4].text,
        'civil_status': bio[5].text,
        'religion': bio[6].text,
        'profession': bio[7].text,
        'party': bio[8].text,
    }

def parseLegislativePeriod(period_elem):
    """Extract the fields of a single legislative-period element.

    Args:
        period_elem: XML element describing one legislative period. Fields
            are read by position (children 0, 3, 4, 5 and 7).

    Returns:
        A dict with the keys 'period_number', 'wkr_number', 'wkr_name',
        'wkr_land' and 'mandat_type'. Values are the raw element texts
        (strings), or None when the element has no text.

    Raises:
        IndexError: If ``period_elem`` has fewer than eight children.
    """
    # list(element) replaces the deprecated Element.getchildren().
    period = list(period_elem)

    return {
        'period_number': period[0].text,
        'wkr_number': period[3].text,
        'wkr_name': period[4].text,
        'wkr_land': period[5].text,
        'mandat_type': period[7].text,
    }
    
def parseMdb(mdb):
    """Convert one MDB element into a nested dictionary.

    Args:
        mdb: XML element of a single member. Its first four children are
            read by position: id, names, biography and the container of
            legislative periods.

    Returns:
        A dict with the keys 'id' (element text), 'name' (see parseNames),
        'biography' (see parseBiography) and 'legislative_periods' (a list
        with one parseLegislativePeriod result per child of the container).

    Raises:
        IndexError: If ``mdb`` has fewer than four children.
    """
    # list(element) replaces the deprecated Element.getchildren().
    children = list(mdb)

    return {
        'id': children[0].text,
        'name': parseNames(children[1]),
        'biography': parseBiography(children[2]),
        'legislative_periods': [parseLegislativePeriod(period) for period in children[3]],
    }
        

def getMdbCoredata(mdb_core_data_path='../../data/MdB-Stammdaten-data/MDB_STAMMDATEN.XML'):
    """Parse the MdB core-data XML file into a list of dictionaries.

    Args:
        mdb_core_data_path: Location of the core-data XML file. Defaults to
            the path that was previously hard-coded in this function.

    Returns:
        A list with one parseMdb result per 'MDB' element found anywhere in
        the document.
    """
    mdb_parser = etree.XMLParser(dtd_validation=False)
    # The parser was previously created but never handed to etree.parse.
    tree = etree.parse(mdb_core_data_path, mdb_parser)

    return [parseMdb(mdb) for mdb in tree.findall('.//MDB')]
    

In [18]:
# Parse the core-data file once and keep the resulting list of dicts.
mdb_data = getMdbCoredata()

# Number of parsed members.
print(len(mdb_data))

# Show the first three records, one per line.
print('\n'.join(str(record) for record in mdb_data[:3]))

4366
{'id': '11000001', 'name': {'surname': 'Abelein', 'forename': 'Manfred', 'salutation_title': 'Dr.', 'academic_title': 'Prof. Dr.'}, 'biography': {'birth_date': '20.10.1930', 'birth_place': 'Stuttgart', 'birth_country': None, 'gender': 'männlich', 'civil_status': 'keine Angaben', 'religion': 'katholisch', 'profession': 'Rechtsanwalt, Wirtschaftsprüfer, Universitätsprofessor', 'party': 'CDU'}, 'legislative_periods': [{'period_number': '5', 'wkr_number': '174', 'wkr_name': None, 'wkr_land': 'BWG', 'mandat_type': 'Direktwahl'}, {'period_number': '6', 'wkr_number': '174', 'wkr_name': None, 'wkr_land': 'BWG', 'mandat_type': 'Direktwahl'}, {'period_number': '7', 'wkr_number': '174', 'wkr_name': None, 'wkr_land': 'BWG', 'mandat_type': 'Direktwahl'}, {'period_number': '8', 'wkr_number': '174', 'wkr_name': None, 'wkr_land': 'BWG', 'mandat_type': 'Direktwahl'}, {'period_number': '9', 'wkr_number': '174', 'wkr_name': None, 'wkr_land': 'BWG', 'mandat_type': 'Direktwahl'}, {'period_number': '10

In [47]:
def write_dict_to_json_file(dictionary, path='parsed_mdbs.json'):
    """Serialize ``dictionary`` as indented JSON into a file.

    Args:
        dictionary: Any JSON-serializable object (the notebook passes a list
            of dicts).
        path: Target file; defaults to 'parsed_mdbs.json' in the current
            working directory. An existing file is overwritten.
    """
    # Explicit encoding so the file does not depend on the platform default.
    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(dictionary, outfile, indent=2)
    
def read_mdbs_from_json(path='parsed_mdbs.json'):
    """Load previously stored data from a JSON file.

    Args:
        path: File to read; defaults to 'parsed_mdbs.json' in the current
            working directory.

    Returns:
        The deserialized JSON content.
    """
    # The context manager closes the file handle, which the previous
    # open(...).read() call left to the garbage collector.
    with open(path, encoding='utf-8') as infile:
        return json.load(infile)

In [50]:
# Persist the parsed records as JSON (written to 'parsed_mdbs.json').
write_dict_to_json_file(mdb_data)

In [93]:
# Reload the records from 'parsed_mdbs.json'; the cell below works on `mdbs`.
mdbs = read_mdbs_from_json()

In [92]:
print(len(mdbs))

# Keep the members whose last listed legislative period equals the period
# configured in `legislative_period` (previously a hard-coded 20).
# Entries without any period are skipped instead of raising an IndexError.
# The former unused forename/surname concatenation was dropped: it raised a
# TypeError whenever one of the two fields was None.
current_mdbs = [
    mdb for mdb in mdbs
    if mdb['legislative_periods']
    and int(mdb['legislative_periods'][-1]['period_number']) == legislative_period
]

print(len(current_mdbs))
print(*current_mdbs[:2], sep='\n')

4366
723
{'id': '11001235', 'name': {'surname': 'Kubicki', 'forename': 'Wolfgang', 'salutation_title': None, 'academic_title': None}, 'biography': {'birth_date': '03.03.1952', 'birth_place': 'Braunschweig', 'birth_country': None, 'gender': 'männlich', 'civil_status': 'verheiratet, 2 Kinder', 'religion': None, 'profession': 'Vizepräsident DBT, Rechtsanwalt, Dipl.-Volkswirt', 'party': 'FDP'}, 'legislative_periods': [{'period_number': '12', 'wkr_number': None, 'wkr_name': None, 'wkr_land': None, 'mandat_type': 'Landesliste'}, {'period_number': '15', 'wkr_number': None, 'wkr_name': None, 'wkr_land': None, 'mandat_type': 'Landesliste'}, {'period_number': '19', 'wkr_number': None, 'wkr_name': None, 'wkr_land': None, 'mandat_type': 'Landesliste'}, {'period_number': '20', 'wkr_number': None, 'wkr_name': None, 'wkr_land': None, 'mandat_type': 'Landesliste'}]}
{'id': '11001772', 'name': {'surname': 'Ramsauer', 'forename': 'Peter', 'salutation_title': 'Dr.', 'academic_title': 'Dr.'}, 'biography':