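# Topic Modelling
Trains separate gensim LDA topic models for male and female members and explores them with pyLDAvis.

Based on the tutorial: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/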

In [7]:
import json
from pathlib import Path
import glob
from nltk.corpus import stopwords
from dateutil import parser

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
%matplotlib inline

import logging
# Root logger setup: timestamped messages, and only ERROR or worse is shown.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.ERROR,
)

In [8]:
# German stop word list from NLTK's stopwords corpus; used further down to drop tokens.
# NOTE(review): presumably needs the corpus to be available locally
# (e.g. via nltk.download('stopwords')) — confirm for fresh environments.
stop_words = stopwords.words('german')

## Load Data

In [6]:
# Load every tag export into one flat list of records.
# Each JSON file is expected to contain a list; the lists are concatenated.
records = []
# Sorted so the record order (and everything derived from it) is reproducible;
# the order returned by glob is otherwise filesystem-dependent.
for tag_file in sorted(glob.glob(str(Path('../export/tags/*.json')))):
    # The context manager closes each file immediately instead of leaking the handle.
    with open(tag_file, 'r', encoding='utf-8') as fh:
        # extend() appends in place. The previous `records = records + ...`
        # copied the whole list once per file, i.e. quadratic work overall.
        records.extend(json.load(fh))

print(len(records))

KeyboardInterrupt: 

In [None]:
# Members only, no Presidents: keep records flagged as member whose function
# is not one of the presiding roles.
presiding_roles = ['Präsidium', '2. Vizepräsidium', '1. Vizepräsidium']
r_members = [
    rec for rec in records
    if rec['ismember'] == True and rec['funktion'] not in presiding_roles
]
print(len(r_members))

## Filter by date if needed

In [None]:
# Keep only records whose 'sitzung_date' is on or after the cut-off date.
from_date = parser.parse("2016-05-01")
recent = []
for rec in r_members:
    if parser.parse(rec['sitzung_date']) >= from_date:
        recent.append(rec)
r_members = recent
print(len(r_members))

## Prepare Data

In [None]:
# Remove stop words and only keep nouns, adjectives, verbs and adverbs.

# Part-of-speech tags that survive the filter.
# NOTE(review): these look like STTS tags — confirm against the tagger that
# produced the exports.
keep = ['NN', 'NE',
    'ADJA', 'ADJD',
    'VMFIN', 'VAFIN', 'VVFIN', 'VAIMP', 'VVIMP', 'VVINF', 'VAINF', 'VMINF', 'VVIZU', 'VVPP', 'VMPP', 'VAPP',
    'ADV']

# Sets give constant-time membership tests; the lists were scanned linearly
# for every single tag.
stop_word_set = set(stop_words)
keep_tag_set = set(keep)

for votum in r_members:
    # Rebuild the tag list in one pass. The previous reversed index loop used
    # list.remove(tag), which searches by value from the front of the list:
    # quadratic per record and fragile when the same tag occurs more than once.
    # Slice assignment keeps the same list object, so the update stays in place.
    votum['tags'][:] = [
        tag for tag in votum['tags']
        # tag[0] is checked (lower-cased) against the stop words, tag[2]
        # against the POS whitelist. tag[2] is only read for non-stop-words,
        # exactly as in the original if/elif.
        if tag[0].lower() not in stop_word_set and tag[2] in keep_tag_set
    ]

In [31]:
# One token list per record (first element of every remaining tag), split by
# sex: 'm' goes to the men's data set, every other value to the women's.
data_m = [
    [tag[0] for tag in votum['tags']]
    for votum in r_members
    if votum['geschlecht'] == 'm'
]
data_w = [
    [tag[0] for tag in votum['tags']]
    for votum in r_members
    if votum['geschlecht'] != 'm'
]

# Dictionaries built from the token lists, one per group
id2word_m = corpora.Dictionary(data_m)
id2word_w = corpora.Dictionary(data_w)

# Bag-of-words representation of every document, using the group's own dictionary
corpus_m = list(map(id2word_m.doc2bow, data_m))
corpus_w = list(map(id2word_w.doc2bow, data_w))

## Go LDA

In [32]:
# Train the LDA model for the men's corpus (20 topics, fixed seed).
lda_model_m = gensim.models.ldamodel.LdaModel(
    corpus=corpus_m,
    id2word=id2word_m,
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [33]:
# Train the LDA model for the women's corpus (20 topics, fixed seed).
lda_model_w = gensim.models.ldamodel.LdaModel(
    corpus=corpus_w,
    id2word=id2word_w,
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [34]:
# Persist models, dictionaries and corpora for this run.
root = Path('../export/models/2016-05-01/')
# Create the target directory (and parents) if it does not exist yet;
# otherwise every save below fails on a fresh checkout.
root.mkdir(parents=True, exist_ok=True)

# Store LDA
lda_model_m.save(str(root / 'lda_m.model'))
lda_model_w.save(str(root / 'lda_w.model'))

# Store corpus & id2word
# The corpora are written as JSON; json.dump streams directly into the file.
with open(root / 'corpus_m', 'w', encoding='utf-8') as f:
    json.dump(corpus_m, f, ensure_ascii=False)
id2word_m.save(str(root / 'id2word_m'))

with open(root / 'corpus_w', 'w', encoding='utf-8') as f:
    json.dump(corpus_w, f, ensure_ascii=False)
id2word_w.save(str(root / 'id2word_w'))

## Load models, if needed

In [10]:
# Directory the stored models, dictionaries and corpora are loaded from below.
root = Path('../export/models/2016-05-01/')

In [11]:
# Only needed when the models were not trained in this session:
# restore both LDA models from the files under `root`.
lda_model_m = gensim.models.LdaModel.load(str(root / 'lda_m.model'))
lda_model_w = gensim.models.LdaModel.load(str(root / 'lda_w.model'))

In [12]:
# Restore the dictionaries saved alongside the models.
id2word_m = gensim.corpora.dictionary.Dictionary.load(str(root / 'id2word_m'))
id2word_w = gensim.corpora.dictionary.Dictionary.load(str(root / 'id2word_w'))

# Restore the JSON-encoded corpora. Context managers close the files again;
# the previous open(...).read() left the handles open.
with open(root / 'corpus_m', 'r', encoding='utf-8') as f:
    corpus_m = json.load(f)
with open(root / 'corpus_w', 'r', encoding='utf-8') as f:
    corpus_w = json.load(f)

## Men

In [13]:
# Switch pyLDAvis to notebook mode so prepared visualisations display inline.
pyLDAvis.enable_notebook()

In [14]:
# Prepare the pyLDAvis view for the men's model, corpus and dictionary.
lda_viz = gensimvis.prepare(lda_model_m, corpus_m, id2word_m)
# Bare expression as last line: the notebook displays the visualisation.
lda_viz

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


## Women

In [15]:
# Prepare the pyLDAvis view for the women's model, corpus and dictionary.
# Note that this rebinds `lda_viz`, replacing the men's visualisation object.
lda_viz = gensimvis.prepare(lda_model_w, corpus_w, id2word_w)
# Bare expression as last line: the notebook displays the visualisation.
lda_viz

  default_term_info = default_term_info.sort_values(


## 10 years back (models stored under `../export/models/2011-05-01/`)

In [11]:
# Switch to the models stored under the 2011-05-01 directory. This rebinds
# root, the models, dictionaries and corpora used by the cells below.
root = Path('../export/models/2011-05-01/')

lda_model_m = gensim.models.LdaModel.load(str(root / 'lda_m.model'))
lda_model_w = gensim.models.LdaModel.load(str(root / 'lda_w.model'))

id2word_m = gensim.corpora.dictionary.Dictionary.load(str(root / 'id2word_m'))
id2word_w = gensim.corpora.dictionary.Dictionary.load(str(root / 'id2word_w'))

# Restore the JSON-encoded corpora. Context managers close the files again;
# the previous open(...).read() left the handles open.
with open(root / 'corpus_m', 'r', encoding='utf-8') as f:
    corpus_m = json.load(f)
with open(root / 'corpus_w', 'r', encoding='utf-8') as f:
    corpus_w = json.load(f)

### Men

In [12]:
# Prepare the pyLDAvis view for the men's model, using whichever model,
# corpus and dictionary are currently bound to these names.
lda_viz = gensimvis.prepare(lda_model_m, corpus_m, id2word_m)
# Bare expression as last line: the notebook displays the visualisation.
lda_viz

  default_term_info = default_term_info.sort_values(


### Women

In [13]:
# Prepare the pyLDAvis view for the women's model, using whichever model,
# corpus and dictionary are currently bound to these names.
lda_viz = gensimvis.prepare(lda_model_w, corpus_w, id2word_w)
# Bare expression as last line: the notebook displays the visualisation.
lda_viz

  default_term_info = default_term_info.sort_values(
