In [1]:
import spacy
import json
import datetime
import ast
import numpy as np
import pandas as pd

from gensim.utils import ClippedCorpus
from gensim.models import Phrases, CoherenceModel
from gensim import corpora, models

from pymongo import MongoClient
from itertools import chain

# Connection to mongo

In [2]:
mongourl = "mongodb://admin:adminpassword@localhost:27017"
mongo_client = MongoClient(mongourl)

collection = mongo_client["news"]["article_fr"]
not_processed_docs = collection.find(
    {
        "$or": [
            {"processedEncoding": False},
            {"processedEncoding": {"$exists": False}},
        ]
    }
).limit(2000)

# Pre-processing documents

In [3]:
nlp = spacy.load("fr_core_news_md")

In [4]:
for word in nlp.Defaults.stop_words:
        if nlp.vocab[word].is_stop:
            print(word)

aux
ceci
dans
tardive
longtemps
onzième
suivant
celles-là
donc
autres
devant
toutes
cinquième
quel
reste
au
seize
certain
tien
quarante
deux
delà
strictement
excepté
seul
rendre
allo
où
serait
notamment
une
c’
vingt
plouf
etant
gens
pfut
divers
elle
anterieure
chère
envers
j’
toutefois
trente
devra
six
possibles
puisque
moindres
es
fait
me
couic
exterieur
abord
merci
suffisant
celui
ceux-là
passé
plein
ès
ollé
pure
flac
suis
toc
sept
autrefois
i
certaines
seraient
douzième
clac
vôtres
ils
dix-sept
derniere
sein
directement
moins
prealable
tic
ton
effet
dring
tout
cinq
ont
cinquantaine
devers
hem
dernier
m'
hep
nombreuses
bah
cent
environ
tellement
clic
deja
elle-même
aucun
extenso
particulier
uns
avec
avaient
pouvait
moyennant
ouste
c'
cependant
vifs
dit
vu
necessairement
pres
quant
sinon
uniformement
sera
telles
aura
neuf
dessus
certes
ma
quiconque
sans
mille
lès
douze
nous-mêmes
premier
restant
bat
à
procedant
du
dix-huit
suffisante
ailleurs
pu
les
va
tsouin
première
plus
ah
pour
plu

In [5]:
def fix_stop_words():
    for word in nlp.Defaults.stop_words:
        nlp.vocab[word].is_stop = True
    return

def add_custom_stop_words(custom_stop_words):
    for cw in custom_stop_words:
        nlp.vocab[cw].is_stop = True
    return

def sentence_tokenize(data):
    return [sent for sent in data.sents]

def lemmatize_tokens(data):
    lemmas = []
    for sent in data:
        sent_tokens = []
        for token in sent:
            candidate = token.lemma_.replace("’", "")
            if (not nlp.vocab[candidate].is_stop and not token.is_punct and
                len(candidate) > 1 and not candidate.isspace()):
                sent_tokens.append(candidate)
        lemmas.append(sent_tokens)
        sent_tokens = []
    return lemmas

def flatten_list(data):
    return list(chain.from_iterable(data))

fix_stop_words()

In [6]:
def parse_text(raw_data):
    doc = nlp(raw_data)
    # Retrieve sentences
    sentences = sentence_tokenize(doc)
    # print(len(sentences))
    # Lemmatize + remove stop words
    lemmas = lemmatize_tokens(sentences)
    # print(len(lemmas))
    # Flatten results into a single list
    parsed_text = flatten_list(lemmas)

    return parsed_text

Create list with parsed documents

In [7]:
documents = []
for doc in not_processed_docs:
    parsed_doc = parse_text(doc["text"])
    documents.append(parsed_doc)

# LDA utility functions

In [8]:
def get_word_collocations(tokens):
    bigrams = Phrases(tokens)
    trigrams = Phrases(bigrams[tokens], min_count=1)
    return list(trigrams[bigrams[tokens]])

def string_to_list(tokens):
    return ast.literal_eval(tokens)

def save_lda_model(ldaModule, location):
    with open(location + ".pickle", "wb") as output:
        pickle.dump(ldaModule, output, pickle.HIGHEST_PROTOCOL)

def load_lda_model(location):
    with open(location + ".pickle", "rb") as input_file:
        ldaModule = pickle.load(input_file)
    return ldaModule

# LDA Module implementation

In [9]:
def build_dictionary(doc_collection, use_collocations=True, doc_threshold=3):
    if use_collocations:
        doc_collection = get_word_collocations(doc_collection)
    else:
        doc_collection = [string_to_list(t) for t in doc_collection]

    dictionary = corpora.Dictionary(doc_collection)

    if doc_threshold > 0:
        dictionary.filter_extremes(no_below = doc_threshold)

    return dictionary

def build_corpus(doc_collection, dictionary):
    corpus = [dictionary.doc2bow(list_of_tokens) for list_of_tokens in doc_collection]
    return corpus

def build_lda_model(corpus, dictionary, num_topics = 20, passes = 4, alpha = 0.01, eta = 0.01):
    model = models.LdaModel(corpus,
                            num_topics = num_topics,
                            id2word = dictionary,
                            passes = passes,
                            alpha = [alpha] * num_topics,
                            eta = [eta] * len(dictionary.keys()))
    return model

def get_topics(model, corpus, num_docs):
    topics = [model[corpus[i]] for i in range(num_docs)]
    return topics

In [10]:
dictionary = build_dictionary(documents)
corpus = build_corpus(documents, dictionary)

# Topic Coherence

In [11]:
def compute_coherence_values(documents, corpus, dictionary, k, a, b, passes):
    model = build_lda_model(corpus, 
                            dictionary, 
                            num_topics = k,
                            passes = passes,
                            alpha = a,
                            eta = b)
    coherence_model_lda = CoherenceModel(model = model, 
                                         texts = documents, 
                                         dictionary = dictionary, 
                                         coherence='c_v')
    return coherence_model_lda.get_coherence()

In [12]:
# Topics range
min_topics = 2
max_topics = 11
topics_range = range(min_topics, max_topics)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))

In [13]:
num_docs = len(corpus)
corpus_sets = [#ClippedCorpus(corpus, num_docs*0.25), 
               #ClippedCorpus(corpus, num_docs*0.5), 
               ClippedCorpus(corpus, int(num_docs*0.75)), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']

In [14]:
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

In [15]:
# LDA model training passes
passes = 5

# iterate through validation corpuses
for i in range(len(corpus_sets)):
    # iterate through number of topics
    print("~"*30)
    print("Starting corpus set {}".format(i))
    print("~"*30)
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterare through beta values
            for b in beta:
                # get the coherence score for the given parameters
                cv = compute_coherence_values(documents, 
                                              corpus = corpus_sets[i], 
                                              dictionary = dictionary, 
                                              k = k, a = a, b = b, passes = passes)
                # Save the model results
                model_results['Validation_Set'].append(corpus_title[i])
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)

pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Starting corpus set 0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Starting corpus set 1
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [16]:
res = pd.read_csv('lda_tuning_results.csv')
res.head()

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,75% Corpus,2,0.01,0.01,0.425948
1,75% Corpus,2,0.01,0.31,0.390175
2,75% Corpus,2,0.01,0.61,0.452736
3,75% Corpus,2,0.01,0.91,0.410218
4,75% Corpus,2,0.31,0.01,0.39253


In [17]:
#res['Validation_Set'] = res['Validation_Set'].map(lambda x: 75 if x == "75% Corpus" else 100)
res_75 = res[res['Validation_Set'] == "75% Corpus"]
res_100 = res[res['Validation_Set'] == "100% Corpus"]

In [18]:
res_100[res_100['Topics'] == 5] 
# k = 4, a = 0.91, b = 0.91 -> 0.537146
# k = 5, a = 0.31, b = 0.91 -> 0.521962

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
192,100% Corpus,5,0.01,0.01,0.372237
193,100% Corpus,5,0.01,0.31,0.36278
194,100% Corpus,5,0.01,0.61,0.44164
195,100% Corpus,5,0.01,0.91,0.428051
196,100% Corpus,5,0.31,0.01,0.413333
197,100% Corpus,5,0.31,0.31,0.436297
198,100% Corpus,5,0.31,0.61,0.454369
199,100% Corpus,5,0.31,0.91,0.391445
200,100% Corpus,5,0.61,0.01,0.434821
201,100% Corpus,5,0.61,0.31,0.393515


In [19]:
c = res_100['Coherence']
info_max_c = res_100[res_100['Coherence'] == max(res_100['Coherence'])]
info_max_c

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
247,100% Corpus,8,0.31,0.91,0.494134


In [20]:
for i in range(0, 70, 10):
    passes = i
    if i == 0:
        passes = 1
    cv = compute_coherence_values(documents, 
                              corpus = corpus, 
                              dictionary = dictionary, 
                              k = 5, a = 0.91, b = 0.31, passes = passes)
    print("Coherence @{}: {}".format(passes, cv))

Coherence @1: 0.366521324006979
Coherence @10: 0.4685323298057658
Coherence @20: 0.4016460150956833
Coherence @30: 0.47353026968068884
Coherence @40: 0.49667817556279575
Coherence @50: 0.4632589146654107
Coherence @60: 0.40510212551790536


In [21]:
k = 5
passes = 40
a = 0.91
b = 0.31

final_lda_model = build_lda_model(corpus, 
                            dictionary, 
                            num_topics = k,
                            passes = passes,
                            alpha = a,
                            eta = b)

In [22]:
final_lda_model.show_topics(formatted=True, num_topics=k, num_words=10)

[(0,
  '0.026*"joueur" + 0.018*"positif" + 0.015*"club" + 0.013*"cas" + 0.011*"test" + 0.010*"match" + 0.010*"devoir" + 0.010*"jour" + 0.010*"septembre" + 0.009*"protocole"'),
 (1,
  '0.012*"pouvoir" + 0.010*"faire" + 0.010*"virus" + 0.008*"covid-19" + 0.008*"devoir" + 0.007*"grand" + 0.007*"qu" + 0.007*"coronavirus" + 0.005*"test" + 0.005*"étude"'),
 (2,
  '0.010*"vaccin" + 0.009*"an" + 0.008*"devoir" + 0.007*"vol" + 0.007*"covid-19" + 0.007*"coronavirus" + 0.006*"année" + 0.006*"euro" + 0.006*"qu" + 0.006*"autorité"'),
 (3,
  '0.014*"masque" + 0.013*"pouvoir" + 0.011*"faire" + 0.009*"école" + 0.009*"devoir" + 0.009*"enfant" + 0.008*"test" + 0.007*"élève" + 0.006*"qu" + 0.006*"cas"'),
 (4,
  '0.037*"cas" + 0.015*"coronavirus" + 0.014*"décès" + 0.014*"nombre" + 0.013*"pays" + 0.011*"contamination" + 0.010*"habitant" + 0.010*"lundi" + 0.010*"jour" + 0.009*"faire"')]

In [23]:
final_cv = compute_coherence_values(documents, 
                              corpus = corpus, 
                              dictionary = dictionary, 
                              k = k, a = a, b = b, passes = k)
print(final_cv)

0.45371439433949384


In [24]:
import pyLDAvis.gensim
import pickle
import pyLDAvis

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(final_lda_model, corpus, dictionary)

LDAvis_prepared