# Topic Modelling
Reference tutorial: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [1]:
import json
from pathlib import Path
import glob
from nltk.corpus import stopwords

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
%matplotlib inline

import logging
# Configure root logging: timestamp, level and message per record, and only
# records at ERROR level or above are emitted.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

  def _figure_formats_changed(self, name, old, new):


In [3]:
# German stop-word list from NLTK's stopwords corpus; used further down to
# drop tokens whose lowercased word appears in it.
stop_words = stopwords.words('german')

## Load Data

In [4]:
# Collect the records of every exported JSON file into one flat list.
# Each file is expected to contain a JSON array of record objects.
records = []
# glob returns matches in arbitrary, platform-dependent order; sorting keeps
# the document order (and therefore the seeded LDA run) stable across machines.
for f in sorted(glob.glob(str(Path('../export/tags/*.json')))):
    # The context manager closes the file handle as soon as it is parsed.
    with open(f, 'r', encoding='utf-8') as fh:
        # extend() appends in place instead of copying the whole list per file.
        records.extend(json.load(fh))

print(len(records))

46521


In [5]:
# Keep only records flagged as members whose function is not one of the
# presidium roles. The role check is only evaluated for member records.
r_members = [
    rec
    for rec in records
    if rec['ismember'] == True
    and rec['funktion'] not in ('Präsidium', '2. Vizepräsidium', '1. Vizepräsidium')
]
print(len(r_members))

34497


## Prepare Data

In [6]:
# Drop stop words and keep only nouns, adjectives, verbs and adverbs.
# Each tag is indexed as tag[0] = word and tag[2] = part-of-speech label
# (the labels look like the German STTS tagset - TODO confirm with the tagger).

keep = ['NN', 'NE',
    'ADJA', 'ADJD',
    'VMFIN', 'VAFIN', 'VVFIN', 'VAIMP', 'VVIMP', 'VVINF', 'VAINF', 'VMINF', 'VVIZU', 'VVPP', 'VMPP', 'VAPP',
    'ADV']

# A set gives constant-time membership tests; the stop-word list would
# otherwise be scanned linearly for every single token.
stop_word_set = set(stop_words)

for votum in r_members:
    # One pass per speech instead of repeated list.remove() calls, each of
    # which rescans the list. Slice assignment keeps the same list object, so
    # any other reference to votum['tags'] sees the filtered result, exactly
    # as with the previous in-place removal.
    votum['tags'][:] = [
        tag
        for tag in votum['tags']
        if tag[0].lower() not in stop_word_set and tag[2] in keep
    ]

In [7]:
# Reduce every speech to the list of its words (first element of each tag)
# and route it into the men's or women's collection: 'm' goes to data_m,
# every other value of 'geschlecht' goes to data_w.
data_m = []
data_w = []
for votum in r_members:
    words = [tag[0] for tag in votum['tags']]
    bucket = data_m if votum['geschlecht'] == 'm' else data_w
    bucket.append(words)

# One gensim dictionary (token <-> id mapping) per group
id2word_m = corpora.Dictionary(data_m)
id2word_w = corpora.Dictionary(data_w)

# Bag-of-words representation of every document, per group
corpus_m = list(map(id2word_m.doc2bow, data_m))
corpus_w = list(map(id2word_w.doc2bow, data_w))

## Go LDA

In [7]:
# Train the 20-topic LDA model on the men's corpus with a fixed random_state.
lda_model_m = gensim.models.LdaModel(
    corpus=corpus_m,
    id2word=id2word_m,
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

KeyboardInterrupt: 

In [None]:
# Train the 20-topic LDA model on the women's corpus with the same settings
# as the men's model.
lda_model_w = gensim.models.LdaModel(
    corpus=corpus_w,
    id2word=id2word_w,
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [8]:
# Persist both trained models so a later session can reload them instead of
# retraining.
model_dir = Path('../export/models')
# Make sure the target directory exists; otherwise the save depends on it
# having been created by hand beforehand.
model_dir.mkdir(parents=True, exist_ok=True)
lda_model_m.save(str(model_dir / 'lda_m.model'))
lda_model_w.save(str(model_dir / 'lda_w.model'))

NameError: name 'lda_model_m' is not defined

## Load models, if needed

In [14]:
# Optional: restore the previously saved models from disk instead of
# training them again.
model_dir = Path('../export/models')
lda_model_m = gensim.models.LdaModel.load(str(model_dir / 'lda_m.model'))
lda_model_w = gensim.models.LdaModel.load(str(model_dir / 'lda_w.model'))

## Men

In [10]:
# Switch on pyLDAvis' notebook integration before any visualisation is built.
pyLDAvis.enable_notebook()

In [11]:
# Build the pyLDAvis view for the men's model; the bare last expression
# makes the notebook display it.
lda_viz = gensimvis.prepare(
    topic_model=lda_model_m,
    corpus=corpus_m,
    dictionary=id2word_m,
)
lda_viz

  default_term_info = default_term_info.sort_values(


## Women

In [12]:
# Build the pyLDAvis view for the women's model; the bare last expression
# makes the notebook display it.
lda_viz = gensimvis.prepare(
    topic_model=lda_model_w,
    corpus=corpus_w,
    dictionary=id2word_w,
)
lda_viz

  default_term_info = default_term_info.sort_values(
