# Und das Wort ist Fleisch geworden
* https://textmining.wp.hs-hannover.de/Preprocessing.html
* Tagset: https://homepage.ruhr-uni-bochum.de/stephen.berman/Korpuslinguistik/Tagsets-STTS.html

In [2]:
import pandas as pd
import nltk
import json
from pathlib import Path
from HanTa import HanoverTagger as ht
import glob
import math

# Do once
#nltk.download('punkt')

## Lemmatize / tag and store as JSON (run only once, then continue with the next section)
Store the lemmatized and tagged data as JSON for performance reasons: loading a CSV with (many!) dicts is very slow, like flying from BER...  
The data is split into several files so that it can be stored on GitHub (max file size = 100 MB).

In [3]:
# Instantiate the HanTa tagger from the 'morphmodel_ger.pgz' model file.
# Created once in its own cell so the same instance is reused for every text.
tagger = ht.HanoverTagger('morphmodel_ger.pgz')

In [4]:
# The votum export is split into two CSV parts; read both and stack them.
votum_files = (
    Path('../export/votum/votum_0.csv'),
    Path('../export/votum/votum_1.csv'),
)
df_votum_raw = pd.concat([pd.read_csv(csv_file) for csv_file in votum_files])

# Keep only the rows whose 'text' column is filled (drops NaN entries)
has_text = df_votum_raw['text'].notna()
df_votum_raw = df_votum_raw[has_text]

In [5]:
# Tokenize every text and tag its tokens. The metadata columns listed below
# are copied through unchanged so that each record is self-contained.
meta_columns = (
    'sitzung_date',
    'dokument_titel',
    'name',
    'vorname',
    'partei',
    'jahrgang',
    'geschlecht',
    'funktion',
    'ismember',
)

records = []
for _, speech in df_votum_raw.iterrows():
    tokens = nltk.tokenize.word_tokenize(speech['text'], language='german')

    record = {column: speech[column] for column in meta_columns}
    # tag_sent() output is stored as-is; later cells unpack it as
    # (word, lemma, pos) triples.
    record['tags'] = tagger.tag_sent(tokens, taglevel=1)
    records.append(record)

In [None]:
# Split the records into a fixed number of files of (at most) equal size
chunks = 7
bucketsize = math.ceil(len(records) / chunks)
for chunk_no in range(chunks):
    first = bucketsize * chunk_no
    # Slicing clips at the end of the list, so trailing buckets may be
    # shorter than bucketsize or even empty.
    subrecords = records[first:first + bucketsize]

    # Store each bucket as its own JSON file (non-ASCII characters kept as-is)
    target = Path('../export/tags/tag_%s.json' % chunk_no)
    with open(target, 'w', encoding='utf-8') as out:
        out.write(json.dumps(subrecords, ensure_ascii=False))

print("finito")

## Load lemmatized data

In [None]:
# Re-assemble the tagged records from the JSON chunk files.
records = []
# glob returns matches in arbitrary order; sorting makes the record order
# reproducible across runs and machines.
for chunk_file in sorted(glob.glob(str(Path('../export/tags/*.json')))):
    # The context manager closes every file handle right after reading
    # instead of leaving it open until garbage collection.
    with open(chunk_file, 'r', encoding='utf-8') as fh:
        # extend() appends in place; `records = records + ...` would copy
        # the whole accumulated list again for each file.
        records.extend(json.load(fh))

print(len(records))

75264


In [None]:
# Keep only records flagged as member and drop those whose 'funktion' is one
# of the presiding roles.
presiding_roles = ['Präsidium', '2. Vizepräsidium', '1. Vizepräsidium']
r_members = [
    rec
    for rec in records
    # `== True` is kept on purpose: it matches exactly the values the
    # original filter accepted.
    if rec['ismember'] == True and rec['funktion'] not in presiding_roles
]
print(len(r_members))

55623


## Analyse

In [None]:
# Collect the lemmas of all nouns, separately per 'geschlecht' of the record.
# In the STTS tagset NN marks common nouns and NE proper nouns.
noun_tags = ('NN', 'NE')
list_m = []
list_w = []
for r in r_members:
    # Each entry of r['tags'] is a (word, lemma, pos) triple; keep the lemma.
    nouns = [lemma for (_word, lemma, pos) in r['tags'] if pos in noun_tags]
    # extend() grows the target list in place. The former
    # `list_m = list_m + tags` copied the whole list for every record,
    # which is quadratic in the total number of nouns.
    if r['geschlecht'] == 'm':
        list_m.extend(nouns)
    elif r['geschlecht'] == 'w':
        list_w.extend(nouns)

In [None]:
def get_freqdist_df(l, n=100):
    """Return the ``n`` most frequent items of ``l`` as a DataFrame.

    Args:
        l: Iterable of hashable items to count (e.g. a list of lemmas).
        n: Number of top entries to keep. Defaults to 100, the value that
            used to be hard-coded.

    Returns:
        A DataFrame with the columns ``index`` (0-based position in the
        frequency ranking), ``w`` (the item) and ``count`` (its number of
        occurrences), ordered by descending count.
    """
    # Only most_common() is needed, which nltk.FreqDist inherits from
    # collections.Counter, so the standard-library class is sufficient.
    from collections import Counter

    top_items = Counter(l).most_common(n)
    df_dist = pd.DataFrame(top_items, columns=['w', 'count'])
    # drop=False turns the positional index into a regular 'index' column,
    # so the rank of each item stays visible after frames are combined.
    return df_dist.reset_index(drop=False)

# Build one frequency table per gender code and label every row with it
df_w = get_freqdist_df(list_w).assign(geschlecht='w')
df_m = get_freqdist_df(list_m).assign(geschlecht='m')

# Stack both tables ('w' rows first). keep=False removes every word that
# occurs more than once, i.e. in both tables, so only words exclusive to one
# of the two top lists remain.
df_concat = pd.concat([df_w, df_m]).drop_duplicates(subset=['w'], keep=False)

df_concat

Unnamed: 0,index,w,count,geschlecht
57,57,Jugendliche,1902,w
60,60,Familie,1820,w
65,65,Angebot,1768,w
85,85,Eltern,1487,w
87,87,Umsetzung,1467,w
88,88,Universität,1461,w
91,91,Schüler,1369,w
93,93,Bildung,1314,w
94,94,Personal,1302,w
95,95,Grundlage,1279,w


In [None]:
# Remaining words whose row is labelled geschlecht == 'w'
only_w = df_concat['geschlecht'] == 'w'
list(df_concat.loc[only_w, 'w'])

['Jugendliche',
 'Familie',
 'Angebot',
 'Eltern',
 'Umsetzung',
 'Universität',
 'Schüler',
 'Bildung',
 'Personal',
 'Grundlage',
 'Folge']

In [None]:
# Remaining words whose row is labelled geschlecht == 'm'
only_m = df_concat['geschlecht'] == 'm'
list(df_concat.loc[only_m, 'w'])

['Herr',
 'Sache',
 'Wort',
 'Vorstoss',
 'Regierungsrat',
 'Winterthur',
 'Parlament',
 'Schluß',
 'Geschäft',
 'Politik',
 'Evp']