# Und das Wort ist Fleisch geworden
* https://textmining.wp.hs-hannover.de/Preprocessing.html
* Tagset: https://homepage.ruhr-uni-bochum.de/stephen.berman/Korpuslinguistik/Tagsets-STTS.html

In [1]:
import pandas as pd
import nltk
import json
from pathlib import Path
#import utils
from HanTa import HanoverTagger as ht
import glob
#import ast
#import functools
#import copy
#import matplotlib.pyplot as plt
import math

# Do once
#nltk.download('punkt')

## Lemmatize / tag and store as JSON (only once, then go to the next cell)
Store the lemmatized and tagged data as JSON for performance reasons: loading a CSV with (many!) dicts is very slow and like flying from BER...  
Split it into several files so it can be stored on GitHub (max file size = 100 MB).

In [2]:
# Instantiate the HanTa tagger from the 'morphmodel_ger.pgz' model file
# (presumably HanTa's bundled German morphology model - confirm).
# The tagging cell below calls tagger.tag_sent() on every row's tokens.
tagger = ht.HanoverTagger('morphmodel_ger.pgz')

In [None]:
# Read the exported votum table and, in the same step, discard every row
# whose 'text' column is missing.
votum_csv = Path('../export/votum.csv')
df_votum_raw = pd.read_csv(votum_csv).dropna(subset=['text'])

In [24]:
# Start tagging
# Columns that are copied unchanged from each row into its record.
meta_columns = ('name', 'vorname', 'partei', 'jahrgang', 'geschlecht', 'funktion', 'ismember')

records = []
for _, speech in df_votum_raw.iterrows():
    # Split the raw text into German word tokens before handing it to the tagger.
    tokens = nltk.tokenize.word_tokenize(speech['text'], language='german')

    entry = {column: speech[column] for column in meta_columns}
    # The analysis cell further down unpacks every tag as (word, lemma, pos).
    entry['tags'] = tagger.tag_sent(tokens, taglevel=1)
    records.append(entry)

In [58]:

# Split
# Spread the records over a fixed number of JSON files of (at most)
# `bucketsize` entries each.
chunks = 5
bucketsize = math.ceil(len(records) / chunks)
for i in range(chunks):
    start = bucketsize * i
    # Slicing clips at the end of the list, so the last bucket may be shorter
    # (or empty) without any explicit bounds check.
    subrecords = records[start:start + bucketsize]

    # Store
    target = Path('../export/tags/tag_%s.json' % i)
    with open(target, 'w', encoding='utf-8') as f:
        f.write(json.dumps(subrecords, ensure_ascii=False))

print("finito")

finito


## Load lemmatized data

In [3]:
# Read every tag chunk under ../export/tags back into one flat list.
records = []
# glob returns matches in arbitrary order; sorting the paths keeps the record
# order (and therefore tie-breaking in later frequency counts) reproducible.
for chunk_file in sorted(glob.glob(str(Path('../export/tags/*.json')))):
    # The context manager closes each file handle right after parsing.
    with open(chunk_file, 'r', encoding='utf-8') as f:
        # extend() appends in place instead of copying the growing list per file.
        records.extend(json.load(f))

print(len(records))

46521


In [5]:
# Members only, no Presidents
r_members = list(filter(lambda x: x['ismember'] == True, records))
r_members = list(filter(lambda x: x['funktion'] not in ['Präsidium', '2. Vizepräsidium', '1. Vizepräsidium'], r_members))
print(len(r_members))

34497


## Analyse

In [115]:
# Select only Nouns
list_m = []
list_w = []
for r in r_members:
    tags = [lemma for (word,lemma,pos) in r['tags'] if pos == "NN" or pos == "NE"]
    if r['geschlecht'] == 'm': list_m = list_m + tags
    elif r['geschlecht'] == 'w': list_w = list_w + tags

In [158]:
def get_freqdist_df(l, n=50):
    """Return the ``n`` most frequent items of ``l`` as a DataFrame.

    Args:
        l: Iterable of hashable items (e.g. lemma strings) to count.
        n: Number of top entries to keep. ``None`` keeps every distinct
            item. Defaults to 50.

    Returns:
        A DataFrame with the columns ``index`` (0-based rank, most frequent
        first), ``w`` (the item) and ``count`` (its number of occurrences).
        Items with equal counts keep the order in which they first appeared.
    """
    # Function-scope import keeps this helper self-contained.
    from collections import Counter

    # nltk.FreqDist is a Counter subclass, so Counter.most_common() yields the
    # same ranking without needing nltk for plain counting.
    ranked = Counter(l).most_common(n)
    df_dist = pd.DataFrame(ranked, columns=['w', 'count'])
    # drop=False turns the positional index into an explicit 'index' column.
    return df_dist.reset_index(drop=False)

# One frequency table per 'geschlecht' value, labelled so the rows can still
# be told apart after stacking.
df_m = get_freqdist_df(list_m).assign(geschlecht='m')
df_w = get_freqdist_df(list_w).assign(geschlecht='w')

# keep=False drops *every* row whose word occurs in both tables, so only the
# words that appear in exactly one of the two tables remain.
df_concat = pd.concat([df_w, df_m]).drop_duplicates(subset=['w'], keep=False)

df_concat

Unnamed: 0,index,w,count,geschlecht
9,9,Kind,2561,w
20,20,Sp,1980,w
31,31,Frau,1631,w
32,32,Mensch,1630,w
33,33,Grüne,1616,w
40,40,Arbeit,1521,w
45,45,Schule,1449,w
48,48,Ziel,1399,w
22,22,Herr,4679,m
37,37,Rat,3720,m
