In [1]:
import spacy
import json
import datetime
import ast
import numpy as np
import pandas as pd

from gensim.utils import ClippedCorpus
from gensim.models import Phrases, CoherenceModel
from gensim import corpora, models

from pymongo import MongoClient
from itertools import chain

# Connection to mongo

In [24]:
mongourl = "mongodb://admin:adminpassword@localhost:27017"
mongo_client = MongoClient(mongourl)

collection = mongo_client["news"]["article_nl"]
not_processed_docs = collection.find(
    {
        "$or": [
            {"processedEncoding": False},
            {"processedEncoding": {"$exists": False}},
        ]
    }
).limit(2000)

# Pre-processing documents

In [25]:
nlp = spacy.load("nl_core_news_md")

In [26]:
for word in nlp.Defaults.stop_words:
        if nlp.vocab[word].is_stop:
            print(word)

mede
boven
sindsdien
erdoor
toe
zodra
sinds
gemogen
vooralsnog
juist
tamelijk
anderen
inmiddels
ander
afgelopen
uwe
wier
of
vooruit
hoe
gedurende
nabij
elk
bovendien
verder
moest
die
hem
alleen
vanuit
verre
publ
eveneens
zijnde
allen
geheel
mijzelf
tenzij
op
aan
niet
omdat
nadat
na
vanaf
voorts
achter
dezelfde
zelfde
door
omtrent
daarop
precies
sommige
hierbeneden
onszelf
ikke
mag
wil
effe
dan
nu
totdat
kon
zeker
even
geleden
beneden
zo
gegeven
al
want
gewoon
ooit
weg
welken
hele
daarna
dien
als
klaar
zulke
meer
andere
bijvoorbeeld
bepaald
ja
eerste
rondom
wanneer
naar
opnieuw
wel
voorheen
zo’n
later
mij
had
zonder
betere
vroeg
deze
hier
zullen
uw
dit
echter
enkel
vooral
zou
ondertussen
vgl
mogen
mochten
daarom
omlaag
hare
kunt
dat
geweest
en
werden
heeft
waar
zulks
weinig
prof
met
wij
toch
geven
is
binnen
nogal
was
wordt
om
een
krachtens
gewoonweg
misschien
hierin
vanwege
mezelf
spoedig
beter
liever
onder
steeds
kan
wiens
zichzelf
te
zekere
weldra
wegens
uwen
moet
behalve
voordat
tuss

In [27]:
def fix_stop_words():
    for word in nlp.Defaults.stop_words:
        nlp.vocab[word].is_stop = True
    return

def add_custom_stop_words(custom_stop_words):
    for cw in custom_stop_words:
        nlp.vocab[cw].is_stop = True
    return

def sentence_tokenize(data):
    return [sent for sent in data.sents]

def lemmatize_tokens(data):
    lemmas = []
    for sent in data:
        sent_tokens = []
        for token in sent:
            candidate = token.lemma_.replace("’", "")
            if (not nlp.vocab[candidate].is_stop and not token.is_punct and
                len(candidate) > 1 and not candidate.isspace()):
                sent_tokens.append(candidate)
        lemmas.append(sent_tokens)
        sent_tokens = []
    return lemmas

def flatten_list(data):
    return list(chain.from_iterable(data))

fix_stop_words()

In [28]:
def parse_text(raw_data):
    doc = nlp(raw_data)
    # Retrieve sentences
    sentences = sentence_tokenize(doc)
    # print(len(sentences))
    # Lemmatize + remove stop words
    lemmas = lemmatize_tokens(sentences)
    # print(len(lemmas))
    # Flatten results into a single list
    parsed_text = flatten_list(lemmas)

    return parsed_text

Create list with parsed documents

In [29]:
documents = []
for doc in not_processed_docs:
    parsed_doc = parse_text(doc["text"])
    documents.append(parsed_doc)

# LDA utility functions

In [30]:
def get_word_collocations(tokens):
    bigrams = Phrases(tokens)
    trigrams = Phrases(bigrams[tokens], min_count=1)
    return list(trigrams[bigrams[tokens]])

def string_to_list(tokens):
    return ast.literal_eval(tokens)

def save_lda_model(ldaModule, location):
    with open(location + ".pickle", "wb") as output:
        pickle.dump(ldaModule, output, pickle.HIGHEST_PROTOCOL)

def load_lda_model(location):
    with open(location + ".pickle", "rb") as input_file:
        ldaModule = pickle.load(input_file)
    return ldaModule

# LDA Module implementation

In [31]:
def build_dictionary(doc_collection, use_collocations=True, doc_threshold=3):
    if use_collocations:
        doc_collection = get_word_collocations(doc_collection)
    else:
        doc_collection = [string_to_list(t) for t in doc_collection]

    dictionary = corpora.Dictionary(doc_collection)

    if doc_threshold > 0:
        dictionary.filter_extremes(no_below = doc_threshold)

    return dictionary

def build_corpus(doc_collection, dictionary):
    corpus = [dictionary.doc2bow(list_of_tokens) for list_of_tokens in doc_collection]
    return corpus

def build_lda_model(corpus, dictionary, num_topics = 20, passes = 4, alpha = 0.01, eta = 0.01):
    model = models.LdaModel(corpus,
                            num_topics = num_topics,
                            id2word = dictionary,
                            passes = passes,
                            alpha = [alpha] * num_topics,
                            eta = [eta] * len(dictionary.keys()))
    return model

def get_topics(model, corpus, num_docs):
    topics = [model[corpus[i]] for i in range(num_docs)]
    return topics

In [32]:
dictionary = build_dictionary(documents)
corpus = build_corpus(documents, dictionary)

# Topic Coherence

In [33]:
def compute_coherence_values(documents, corpus, dictionary, k, a, b, passes):
    model = build_lda_model(corpus, 
                            dictionary, 
                            num_topics = k,
                            passes = passes,
                            alpha = a,
                            eta = b)
    coherence_model_lda = CoherenceModel(model = model, 
                                         texts = documents, 
                                         dictionary = dictionary, 
                                         coherence='c_v')
    return coherence_model_lda.get_coherence()

In [34]:
# Topics range
min_topics = 2
max_topics = 11
topics_range = range(min_topics, max_topics)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))

In [35]:
num_docs = len(corpus)
corpus_sets = [#ClippedCorpus(corpus, num_docs*0.25), 
               #ClippedCorpus(corpus, num_docs*0.5), 
               ClippedCorpus(corpus, int(num_docs*0.75)), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']

In [36]:
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

In [37]:
# LDA model training passes
passes = 5

# iterate through validation corpuses
for i in range(len(corpus_sets)):
    # iterate through number of topics
    print("~"*30)
    print("Starting corpus set {}".format(i))
    print("~"*30)
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterare through beta values
            for b in beta:
                # get the coherence score for the given parameters
                cv = compute_coherence_values(documents, 
                                              corpus = corpus_sets[i], 
                                              dictionary = dictionary, 
                                              k = k, a = a, b = b, passes = passes)
                # Save the model results
                model_results['Validation_Set'].append(corpus_title[i])
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)

pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Starting corpus set 0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Starting corpus set 1
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [38]:
res = pd.read_csv('lda_tuning_results.csv')
res.head()

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,75% Corpus,2,0.01,0.01,0.622299
1,75% Corpus,2,0.01,0.31,0.893531
2,75% Corpus,2,0.01,0.61,0.631271
3,75% Corpus,2,0.01,0.91,0.621133
4,75% Corpus,2,0.31,0.01,0.619181


In [39]:
#res['Validation_Set'] = res['Validation_Set'].map(lambda x: 75 if x == "75% Corpus" else 100)
res_75 = res[res['Validation_Set'] == "75% Corpus"]
res_100 = res[res['Validation_Set'] == "100% Corpus"]

In [40]:
res_100[res_100['Topics'] == 5] 
# k = 4, a = 0.91, b = 0.91 -> 0.537146
# k = 5, a = 0.31, b = 0.91 -> 0.521962

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
192,100% Corpus,5,0.01,0.01,0.556172
193,100% Corpus,5,0.01,0.31,0.466264
194,100% Corpus,5,0.01,0.61,0.593936
195,100% Corpus,5,0.01,0.91,0.474216
196,100% Corpus,5,0.31,0.01,0.837624
197,100% Corpus,5,0.31,0.31,0.582651
198,100% Corpus,5,0.31,0.61,0.69592
199,100% Corpus,5,0.31,0.91,0.597254
200,100% Corpus,5,0.61,0.01,0.709315
201,100% Corpus,5,0.61,0.31,0.698091


In [41]:
c = res_100['Coherence']
info_max_c = res_100[res_100['Coherence'] == max(res_100['Coherence'])]
info_max_c

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
238,100% Corpus,7,0.91,0.61,0.866113


In [42]:
for i in range(0, 70, 10):
    passes = i
    if i == 0:
        passes = 1
    cv = compute_coherence_values(documents, 
                              corpus = corpus, 
                              dictionary = dictionary, 
                              k = 5, a = 0.31, b = 0.01, passes = passes)
    print("Coherence @{}: {}".format(passes, cv))

Coherence @1: 0.8201182761426324
Coherence @10: 0.709113028816256
Coherence @20: 0.5864714063151694
Coherence @30: 0.5860337716911111
Coherence @40: 0.5908862420556753
Coherence @50: 0.7217586520605487
Coherence @60: 0.6084338427911781


In [43]:
k = 5
passes = 10
a = 0.31
b = 0.01

final_lda_model = build_lda_model(corpus, 
                            dictionary, 
                            num_topics = k,
                            passes = passes,
                            alpha = a,
                            eta = b)

In [44]:
final_lda_model.show_topics(formatted=True, num_topics=k, num_words=10)

[(0,
  '0.014*"gaan" + 0.013*"mens" + 0.012*"komen" + 0.010*"coronaviru" + 0.009*"zeggen" + 0.009*"maken" + 0.008*"zien" + 0.007*"heel" + 0.007*"werknemer" + 0.007*"groot"'),
 (1,
  '0.024*"coronaviru" + 0.016*"virus" + 0.016*"mens" + 0.014*"gaan" + 0.012*"china" + 0.009*"aantal" + 0.009*"ziekenhuis" + 0.009*"patiënt" + 0.009*"nieuw" + 0.008*"besmetting"'),
 (2,
  '0.044*"universiteit" + 0.044*"bosch" + 0.043*"covid-19" + 0.043*"cel" + 0.043*"utrecht" + 0.022*"uitbreken" + 0.022*"virus" + 0.022*"voorkomen" + 0.022*"ziekte" + 0.022*"verspreiden"'),
 (3,
  '0.033*"covid-19" + 0.032*"utrecht" + 0.032*"waarschuwen" + 0.030*"bosch" + 0.030*"universiteit" + 0.029*"cel" + 0.022*"gaan" + 0.021*"zaak" + 0.021*"voordeel" + 0.020*"schrijven"'),
 (4,
  '0.037*"coronaviru" + 0.031*"onderzoeker" + 0.025*"covid-19" + 0.020*"virus" + 0.017*"nieuw" + 0.017*"cel" + 0.015*"trein" + 0.014*"arriva" + 0.014*"ziekte" + 0.012*"komen"')]

In [45]:
final_cv = compute_coherence_values(documents, 
                              corpus = corpus, 
                              dictionary = dictionary, 
                              k = k, a = a, b = b, passes = k)
print(final_cv)

0.43128200248726706


In [46]:
import pyLDAvis.gensim
import pickle
import pyLDAvis

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(final_lda_model, corpus, dictionary)

LDAvis_prepared