# Clean extracted data
Clean up the extracted data and add personal information (who spoke, which party the speaker belonged to, etc.). This takes a while...

In [1]:
import json
import pandas as pd
from pathlib import Path
import re
import utils

In [2]:
# Load the document index and keep only the rows categorised as "Protokoll"
df = pd.read_csv(Path('../export/dokumente.csv'))
df = df.loc[df['dokument_kategorie'] == 'Protokoll']

# Load the council members
with open(Path('../export/mitglieder.json'), encoding='utf-8') as f:
    raete = json.load(f)

# Precompute the lower-cased "vorname name" string each member is looked up by
for member in raete:
    member['_name'] = ' '.join((member['vorname'].lower(), member['name'].lower()))

# Some names are misspelled in the source and some people changed their name
# (marriage). Append one extra entry per alias to the member list; each alias
# row is read as [alias, vorname, name].
raete.extend(
    {
        '_name': alias[0].lower(),
        'name': alias[2],
        'vorname': alias[1],
    }
    for alias in utils.aliases
)

In [3]:
# Titles and stray ", party, town." fragments that are stripped from every
# message before the speaker name is extracted. The replacements run in list
# order, so the longer feminine forms come before their masculine prefixes
# (e.g. 'Ratspräsidentin' before 'Ratspräsident'); otherwise an "in" would be
# left behind.
remove = [
    'Ratsvizepräsidentin', 'Ratspräsidentin', 'Regierungsrätin',
    'Ratspräsident', 'Regierungsrat', 'Ratsvizepräsident',
    'Vizepräsidentin', 'Vizepräsident',
    ', Niederhasli.', ', SVP, Zürich.', ', SVP, Dinhard.',
    ', GLP, Dübendorf.', ', SVP, Volketswil.',
    ', SVP, Uetikon am See.',
]

# records:  one dict per message whose speaker was matched to a member
# notfound: one dict per message whose speaker could not be matched
records = []
notfound = []

# Index the members by their lower-cased full name once instead of scanning
# the whole list for every message. setdefault keeps the FIRST entry per name,
# which is the entry a front-to-back search of the list would return.
raete_by_name = {}
for rat in raete:
    raete_by_name.setdefault(rat['_name'], rat)

# Loop over all documents
for _, row in df.iterrows():

    # Build the path of the extracted JSON and load it; the context manager
    # closes the file handle again after each document.
    extract_path = Path('../export/extracts/%s' % row['_filename']).with_suffix('.json')
    with open(extract_path, encoding='utf-8') as extract_file:
        data = json.load(extract_file)

    filename = extract_path.stem

    for msg in data:

        # Replace double space
        msg = msg.replace('  ', ' ')

        if msg.strip() == '':
            continue

        # Remove titles / fragments
        for fragment in remove:
            msg = msg.replace(fragment, '')

        msg = msg.strip()

        # The speaker name is everything before the first "(" or ":".
        # This pattern can match the empty string, so re.match always
        # succeeds; an empty or unknown name simply ends up in `notfound`.
        name = re.match(r"[^(:]*", msg).group(0).strip()
        rat = raete_by_name.get(name.lower())

        # If the name is not found, add it to a list for manual check
        if rat is None:
            notfound.append({
                'name': name,
                'f': filename,
            })
            continue

        # Name found: undo hyphenation at line breaks, flatten line breaks
        # and collapse double spaces.
        text = msg.replace('-\n', '')
        text = text.replace('\n', ' ')
        text = text.replace('  ', ' ')
        text = text.strip()

        # The spoken text is everything after the first ":"; messages
        # without a colon are skipped.
        spoken = re.search(r".*?:(.*)", text)
        if spoken is None:
            continue

        records.append({
            'name': rat['name'],
            'vorname': rat['vorname'],
            'text': spoken.group(1).strip(),
            'f': filename
        })

print("finito")

finito


### If you need to find missing names, use this section. If not, go to the next block.

In [4]:
# Number of messages whose speaker name could not be matched to a member
len(notfound)

24945

In [18]:
# Turn the unmatched speaker names into a DataFrame (columns 'name' and 'f')
# and copy it to the system clipboard for the manual check.
x = pd.DataFrame(notfound)
x.to_clipboard()

In [30]:
# Clean notfounds: narrow the unmatched names down to entries that look like
# real person names, so the manual check is manageable.

# Work on explicit copies so the column assignments below never write into a
# slice of another frame (avoids pandas' SettingWithCopyWarning).
x = x.copy()

# Flag names containing '.', ',' or a line break and drop them.
# NOTE: despite its name, the 'keep' column marks the rows that are REMOVED;
# the column name is left unchanged so the resulting frame keeps its layout.
x['keep'] = x['name'].str.contains(r'[.,\n]').astype(bool)
x = x[~x['keep']].copy()

x['name'] = x['name'].str.strip()

# At least one space
x['space'] = x['name'].str.contains(' ', regex=False).astype(bool)
x = x[x['space']].copy()

# Longer than three characters
x['len'] = x['name'].str.len()
x = x[x['len'] > 3]

x.to_clipboard()
x.head()

Unnamed: 0,name,f,keep,len,space
7,Ratssekretär Hans Peter Frei,2002-01-07-239d922f46224daf830d4ce5b15537ac-332,False,28,True
8,Ratssekretär Hans Peter Frei,2002-01-07-239d922f46224daf830d4ce5b15537ac-332,False,28,True
20,Regierungspräsident Markus Notter,2002-01-21-d2fd7954b0d143568a5850a14b8a834b-332,False,33,True
28,Regierungspräsident Markus Notter,2002-01-21-d2fd7954b0d143568a5850a14b8a834b-332,False,33,True
49,Ratssekretär Hans Peter Frei,2002-01-21-47cad2740d0b47329bc0408c3de53d19-332,False,28,True


In [75]:
# Search the members by a surname fragment and print each hit as an
# ['', vorname, name] row, i.e. the [alias, vorname, name] shape in which
# utils.aliases is read, with the alias left blank.
# The result gets its own name so the `x` DataFrame of unmatched names is not
# overwritten and the cleaning cell can still be re-run afterwards.
name_matches = [rat for rat in raete if 'Scheffe' in rat['name']]
for rat in name_matches:
    print("['', '%s', '%s']," % (rat['vorname'], rat['name']))

['', 'Elisabeth', 'Scheffeldt Kern'],


In [5]:
# Search the members by a first-name fragment and print each hit as an
# ['', vorname, name] row, i.e. the [alias, vorname, name] shape in which
# utils.aliases is read, with the alias left blank.
# The result gets its own name so the `x` DataFrame of unmatched names is not
# overwritten and the cleaning cell can still be re-run afterwards.
vorname_matches = [rat for rat in raete if 'Alfredx' in rat['vorname']]

for rat in vorname_matches:
    print("['', '%s', '%s']," % (rat['vorname'], rat['name']))

## Join with personal information and save.
This takes a while

In [6]:
# One row per matched message
df_votum = pd.DataFrame(records)

# Document index. `encoding` belongs to read_csv, not to Path(): Path ignores
# unknown keyword arguments (and newer Python versions deprecate passing them).
df_dokumente = pd.read_csv(Path('../export/dokumente.csv'), encoding='utf-8')

# Same key as the 'f' field of the records: the file name without its suffix
df_dokumente['f'] = df_dokumente['_filename'].apply(lambda x: Path(x).stem)

# Join Votum and Dokumente (inner join: messages without a document row are dropped)
df_votum = df_votum.merge(df_dokumente[['f', 'sitzung_name', 'sitzung_date', 'sitzung_gremium', 'dokument_titel']], on = 'f')

# Merge names into a "name_vorname" key; vectorised string concatenation
# instead of a row-wise apply.
df_votum['_join'] = df_votum['name'].astype(str) + '_' + df_votum['vorname'].astype(str)

# Typecast
df_votum['sitzung_date'] = pd.to_datetime(df_votum['sitzung_date'])

In [7]:
with open(Path('../export/mitglieder.json'), encoding='utf-8') as f:
    mitglieder = json.load(f)

# Typecast
# NOTE(review): presumably converts the 'start'/'end' fields to datetimes in
# place (the return value is not used) - confirm in utils.
utils.kantonsrat_to_datetime(mitglieder)


def _first_active(entries, date):
    """Return the first entry whose start..end range contains `date`, else None."""
    for entry in entries:
        if (entry['start'] <= date) and (entry['end'] >= date):
            return entry
    return None


# Index the members by the same "name_vorname" key as df_votum['_join'].
# setdefault keeps the first member per key, i.e. the one a front-to-back
# search of the list would find.
mitglieder_by_key = {}
for r in mitglieder:
    mitglieder_by_key.setdefault("%s_%s" % (r['name'], r['vorname']), r)

# No join, because could be multiple Partys per name (if they switched). Loop and find party
not_found = []

for i, row in df_votum.iterrows():
    record = mitglieder_by_key.get(row['_join'])

    if record is None:
        not_found.append(row)
        continue

    date = row['sitzung_date']

    # Party at the date of the session
    party_entry = _first_active(record['partei'], date)
    party = party_entry['bezeichnung'] if party_entry is not None else None

    # Is Kantonsrat (not former member and now Regierungsrat)
    ismember = _first_active(record['einsitz'], date) is not None

    # Function at the date of the session
    function_entry = _first_active(record['funktion'], date)
    function = function_entry['bezeichnung'] if function_entry is not None else None

    # Written cell by cell on purpose: this keeps the column order and dtypes
    # of the resulting frame (and therefore the exported CSV) as they were.
    df_votum.loc[i, 'partei'] = party
    df_votum.loc[i, 'geschlecht'] = record['geschlecht']
    df_votum.loc[i, 'jahrgang'] = record['jahrgang']
    df_votum.loc[i, 'funktion'] = function
    df_votum.loc[i, 'ismember'] = ismember

if len(not_found) > 0:
    print("Could not match everything. Have a look at not_found")

## Split Dataframe (GitHub)

In [8]:
# Store
# The frame is written as two CSV files, split after the first ROWS_PER_FILE
# rows (presumably to keep each file small enough for GitHub - see the
# section title).
ROWS_PER_FILE = 30000

votum_dir = Path('../export/votum')
votum_dir.mkdir(parents=True, exist_ok=True)

# errors='ignore' makes the cell safe to re-run after '_join' is already gone
df_votum = df_votum.drop(columns=['_join'], errors='ignore')
df_votum.iloc[0:ROWS_PER_FILE].to_csv(votum_dir / 'votum_0.csv', index=False)
df_votum.iloc[ROWS_PER_FILE:].to_csv(votum_dir / 'votum_1.csv', index=False)

In [9]:
# Rows of df_votum for which no member record was found; an empty list means every speaker was matched
not_found

[]