In [42]:
import spacy
import json
import datetime
import ast
import numpy as np
import pandas as pd

from gensim.utils import ClippedCorpus
from gensim.models import Phrases, CoherenceModel
from gensim import corpora, models

from pymongo import MongoClient
from itertools import chain

  and should_run_async(code)


# Connection to mongo

In [43]:
mongourl = "mongodb://admin:adminpassword@localhost:27017"
mongo_client = MongoClient(mongourl)

collection = mongo_client["news"]["article"]
not_processed_docs = collection.find(
    {
        "$or": [
            {"processedEncoding": False},
            {"processedEncoding": {"$exists": False}},
        ]
    }
).limit(2000)

  and should_run_async(code)


# Pre-processing documents

In [44]:
nlp = spacy.load("it_core_news_md")

  and should_run_async(code)


In [4]:
for word in nlp.Defaults.stop_words:
        if nlp.vocab[word].is_stop:
            print(word)

ministro
quelle
meno
dove
piuttosto
stai
facesse
avrei
fossimo
parte
agli
giorno
abbiate
glieli
nostri
cos
modo
generale
più
ma
trovato
tuo
lui
maggior
un
negl
dagli
adesso
stiano
lei
altrui
mosto
ed
foste
infatti
fin
avendo
nessuno
dai
miei
verso
due
ansa
preferibilmente
affinche
qualcuna
questa
feci
detto
uomo
alle
facciano
quel
fossero
ex
stessimo
momento
questi
eri
qualcosa
diventa
ultimo
sopra
esempio
eravate
dentro
medesimo
stessero
favore
state
anni
facendo
sue
stato
mi
starete
talvolta
suo
alla
faranno
primo
allo
ne
va
farai
sia
ha
po
sua
mio
potrebbe
gia
sarete
fecero
facevo
così
codesto
durante
avevate
dovra
cortesia
forza
haha
avevo
poco
avemmo
può
essi
facessimo
quantunque
qualche
quest
ognuno
dunque
quanti
presa
nei
ero
conclusione
nove
abbia
fanno
malgrado
ecc
avessi
successivo
fai
fine
srl
sembri
mila
nella
grazie
avrai
me
mia
ho
piedi
si
nostro
staresti
siete
coloro
milioni
varia
prima
faceva
cui
facevamo
facevi
puo
glielo
avremmo
negli
fece
non
coll
moltissimo
press
po

In [45]:
def fix_stop_words():
    for word in nlp.Defaults.stop_words:
        nlp.vocab[word].is_stop = True
    return

def add_custom_stop_words(custom_stop_words):
    for cw in custom_stop_words:
        nlp.vocab[cw].is_stop = True
    return

def sentence_tokenize(data):
    return [sent for sent in data.sents]

def lemmatize_tokens(data):
    lemmas = []
    for sent in data:
        sent_tokens = []
        for token in sent:
            
            candidate = token.lemma_.replace("’", "")
            if (not nlp.vocab[candidate].is_stop and not token.is_punct and
                len(candidate) > 1 and not candidate.isspace()):
                sent_tokens.append(candidate)
        lemmas.append(sent_tokens)
        sent_tokens = []
    return lemmas

def flatten_list(data):
    return list(chain.from_iterable(data))

fix_stop_words()

  and should_run_async(code)


In [46]:
def parse_text(raw_data):
    doc = nlp(raw_data)
    # Retrieve sentences
    sentences = sentence_tokenize(doc)
    # print(len(sentences))
    # Lemmatize + remove stop words
    lemmas = lemmatize_tokens(sentences)
    # print(len(lemmas))
    # Flatten results into a single list
    parsed_text = flatten_list(lemmas)

    return parsed_text

  and should_run_async(code)


Create list with parsed documents

In [47]:
documents = []
for doc in not_processed_docs:
    parsed_doc = parse_text(doc["text"])
    documents.append(parsed_doc)

  and should_run_async(code)


# LDA utility functions

In [48]:
def get_word_collocations(tokens):
    bigrams = Phrases(tokens)
    trigrams = Phrases(bigrams[tokens], min_count=1)
    return list(trigrams[bigrams[tokens]])

def string_to_list(tokens):
    return ast.literal_eval(tokens)

def save_lda_model(ldaModule, location):
    with open(location + ".pickle", "wb") as output:
        pickle.dump(ldaModule, output, pickle.HIGHEST_PROTOCOL)

def load_lda_model(location):
    with open(location + ".pickle", "rb") as input_file:
        ldaModule = pickle.load(input_file)
    return ldaModule

  and should_run_async(code)


# LDA Module implementation

In [49]:
def build_dictionary(doc_collection, use_collocations=True, doc_threshold=3):
    if use_collocations:
        doc_collection = get_word_collocations(doc_collection)
    else:
        doc_collection = [string_to_list(t) for t in doc_collection]

    dictionary = corpora.Dictionary(doc_collection)

    if doc_threshold > 0:
        dictionary.filter_extremes(no_below = doc_threshold)

    return dictionary

def build_corpus(doc_collection, dictionary):
    corpus = [dictionary.doc2bow(list_of_tokens) for list_of_tokens in doc_collection]
    return corpus

def build_lda_model(corpus, dictionary, num_topics = 20, passes = 4, alpha = 0.01, eta = 0.01):
    model = models.LdaModel(corpus,
                            num_topics = num_topics,
                            id2word = dictionary,
                            passes = passes,
                            alpha = [alpha] * num_topics,
                            eta = [eta] * len(dictionary.keys()))
    return model

def get_topics(model, corpus, num_docs):
    topics = [model[corpus[i]] for i in range(num_docs)]
    return topics

  and should_run_async(code)


In [50]:
dictionary = build_dictionary(documents)
corpus = build_corpus(documents, dictionary)

  and should_run_async(code)


# Topic Coherence

In [51]:
def compute_coherence_values(documents, corpus, dictionary, k, a, b, passes):
    model = build_lda_model(corpus, 
                            dictionary, 
                            num_topics = k,
                            passes = passes,
                            alpha = a,
                            eta = b)
    coherence_model_lda = CoherenceModel(model = model, 
                                         texts = documents, 
                                         dictionary = dictionary, 
                                         coherence='c_v')
    return coherence_model_lda.get_coherence()

  and should_run_async(code)


In [52]:
# Topics range
min_topics = 2
max_topics = 11
topics_range = range(min_topics, max_topics)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))

  and should_run_async(code)


In [53]:
num_docs = len(corpus)
corpus_sets = [#ClippedCorpus(corpus, num_docs*0.25), 
               #ClippedCorpus(corpus, num_docs*0.5), 
               ClippedCorpus(corpus, int(num_docs*0.75)), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']

  and should_run_async(code)


In [54]:
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

  and should_run_async(code)


In [57]:
# LDA model training passes
passes = 5

# iterate through validation corpuses
for i in range(len(corpus_sets)):
    # iterate through number of topics
    print("~"*30)
    print("Starting corpus set {}".format(i))
    print("~"*30)
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterare through beta values
            for b in beta:
                # get the coherence score for the given parameters
                cv = compute_coherence_values(documents, 
                                              corpus = corpus_sets[i], 
                                              dictionary = dictionary, 
                                              k = k, a = a, b = b, passes = passes)
                # Save the model results
                model_results['Validation_Set'].append(corpus_title[i])
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)

pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

  and should_run_async(code)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Starting corpus set 0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Starting corpus set 1
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [58]:
res = pd.read_csv('lda_tuning_results.csv')
res.head()

  and should_run_async(code)


Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,75% Corpus,2,0.01,0.01,0.454546
1,75% Corpus,2,0.01,0.31,0.503412
2,75% Corpus,2,0.01,0.61,0.476131
3,75% Corpus,2,0.01,0.91,0.495424
4,75% Corpus,2,0.31,0.01,0.475585


In [59]:
#res['Validation_Set'] = res['Validation_Set'].map(lambda x: 75 if x == "75% Corpus" else 100)
res_75 = res[res['Validation_Set'] == "75% Corpus"]
res_100 = res[res['Validation_Set'] == "100% Corpus"]

  and should_run_async(code)


In [63]:
res_100[res_100['Topics'] == 5] 
# k = 4, a = 0.91, b = 0.91 -> 0.537146
# k = 5, a = 0.31, b = 0.91 -> 0.521962

  and should_run_async(code)


Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
192,100% Corpus,5,0.01,0.01,0.436611
193,100% Corpus,5,0.01,0.31,0.430075
194,100% Corpus,5,0.01,0.61,0.427001
195,100% Corpus,5,0.01,0.91,0.501333
196,100% Corpus,5,0.31,0.01,0.479511
197,100% Corpus,5,0.31,0.31,0.495554
198,100% Corpus,5,0.31,0.61,0.512024
199,100% Corpus,5,0.31,0.91,0.521962
200,100% Corpus,5,0.61,0.01,0.44858
201,100% Corpus,5,0.61,0.31,0.487326


In [60]:
c = res_100['Coherence']
info_max_c = res_100[res_100['Coherence'] == max(res_100['Coherence'])]
info_max_c

  and should_run_async(code)


Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
228,100% Corpus,7,0.31,0.01,0.549417


In [65]:
for i in range(0, 70, 10):
    passes = i
    if i == 0:
        passes = 1
    cv = compute_coherence_values(documents, 
                              corpus = corpus, 
                              dictionary = dictionary, 
                              k = 5, a = 0.31, b = 0.31, passes = passes)
    print("Coherence @{}: {}".format(passes, cv))

  and should_run_async(code)
Coherence @1: 0.5170639182726811
Coherence @10: 0.44685108459433887
Coherence @20: 0.5339450559617924
Coherence @30: 0.4981876836002327
Coherence @40: 0.4986764609749296
Coherence @50: 0.5347984182736576
Coherence @60: 0.5152794540159465


In [71]:
k = 5
passes = 20
a = 0.31
b = 0.31

final_lda_model = build_lda_model(corpus, 
                            dictionary, 
                            num_topics = k,
                            passes = passes,
                            alpha = a,
                            eta = b)

  and should_run_async(code)


In [72]:
final_lda_model.show_topics(formatted=True, num_topics=k, num_words=10)

  and should_run_async(code)


[(0,
  '0.032*"rispettare" + 0.017*"persona" + 0.014*"regione" + 0.014*"Toscana" + 0.014*"residente" + 0.012*"x100.000" + 0.011*"provincia" + 0.010*"Lucca" + 0.009*"Firenze" + 0.009*"guarire"'),
 (1,
  '0.018*"isolamento" + 0.018*"positivo" + 0.017*"Asl" + 0.016*"Sardegna" + 0.016*"domiciliare" + 0.015*"link" + 0.014*"dio" + 0.013*"rientrare" + 0.011*"Roma" + 0.010*"persona"'),
 (2,
  '0.015*"tampone" + 0.013*"positivo" + 0.012*"sintomatico" + 0.010*"Berlusconi" + 0.010*"paziente" + 0.009*"sintomo" + 0.009*"stare" + 0.009*"rientrare" + 0.009*"isolamento" + 0.008*"risultare"'),
 (3,
  '0.014*"stare" + 0.013*"dio" + 0.011*"potere" + 0.007*"dovere" + 0.007*"scuola" + 0.005*"mascherina" + 0.005*"sanitario" + 0.005*"tampone" + 0.005*"partire" + 0.004*"test"'),
 (4,
  '0.028*"positivo" + 0.021*"dio" + 0.018*"tampone" + 0.015*"totale" + 0.015*"registrare" + 0.014*"persona" + 0.013*"numerare" + 0.012*"provincia" + 0.011*"24" + 0.011*"rispettare"')]

In [73]:
final_cv = compute_coherence_values(documents, 
                              corpus = corpus, 
                              dictionary = dictionary, 
                              k = k, a = a, b = b, passes = k)
print(final_cv)

  and should_run_async(code)
0.4812439532241896


In [74]:
import pyLDAvis.gensim
import pickle
import pyLDAvis

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(final_lda_model, corpus, dictionary)

LDAvis_prepared

  and should_run_async(code)
