In [2]:
import spacy
import json
import datetime
import ast
import numpy as np
import pandas as pd

from gensim.utils import ClippedCorpus
from gensim.models import Phrases, CoherenceModel
from gensim import corpora, models

from pymongo import MongoClient
from itertools import chain

# Connection to mongo

In [3]:
mongourl = "mongodb://admin:adminpassword@localhost:27017"
mongo_client = MongoClient(mongourl)

collection = mongo_client["news"]["article_es"]
not_processed_docs = collection.find(
    {
        "$or": [
            {"processedEncoding": False},
            {"processedEncoding": {"$exists": False}},
        ]
    }
).limit(5000)

# Pre-processing documents

In [4]:
nlp = spacy.load("es_core_news_md")

In [5]:
'''for word in nlp.Defaults.stop_words:
        if nlp.vocab[word].is_stop:
            print(word)'''

'for word in nlp.Defaults.stop_words:\n        if nlp.vocab[word].is_stop:\n            print(word)'

In [6]:
def fix_stop_words():
    for word in nlp.Defaults.stop_words:
        nlp.vocab[word].is_stop = True
    return

def add_custom_stop_words(custom_stop_words):
    for cw in custom_stop_words:
        nlp.vocab[cw].is_stop = True
    return

def sentence_tokenize(data):
    return [sent for sent in data.sents]

def lemmatize_tokens(data):
    lemmas = []
    for sent in data:
        sent_tokens = []
        for token in sent:
            candidate = token.lemma_.replace("’", "")
            if (not nlp.vocab[candidate].is_stop and not token.is_punct and
                len(candidate) > 1 and not candidate.isspace()):
                sent_tokens.append(candidate)
        lemmas.append(sent_tokens)
        sent_tokens = []
    return lemmas

def flatten_list(data):
    return list(chain.from_iterable(data))

fix_stop_words()

In [7]:
def parse_text(raw_data):
    doc = nlp(raw_data)
    # Retrieve sentences
    sentences = sentence_tokenize(doc)
    # print(len(sentences))
    # Lemmatize + remove stop words
    lemmas = lemmatize_tokens(sentences)
    # print(len(lemmas))
    # Flatten results into a single list
    parsed_text = flatten_list(lemmas)

    return parsed_text

Create list with parsed documents

In [8]:
documents = []
for doc in not_processed_docs:
    parsed_doc = parse_text(doc["text"])
    documents.append(parsed_doc)
print(len(documents))

5000


# LDA utility functions

In [9]:
def get_word_collocations(tokens):
    bigrams = Phrases(tokens)
    trigrams = Phrases(bigrams[tokens], min_count=1)
    return list(trigrams[bigrams[tokens]])

def string_to_list(tokens):
    return ast.literal_eval(tokens)

def save_lda_model(ldaModule, location):
    with open(location + ".pickle", "wb") as output:
        pickle.dump(ldaModule, output, pickle.HIGHEST_PROTOCOL)

def load_lda_model(location):
    with open(location + ".pickle", "rb") as input_file:
        ldaModule = pickle.load(input_file)
    return ldaModule

# LDA Module implementation

In [10]:
def build_dictionary(doc_collection, use_collocations=True, doc_threshold=3):
    if use_collocations:
        doc_collection = get_word_collocations(doc_collection)
    else:
        doc_collection = [string_to_list(t) for t in doc_collection]

    dictionary = corpora.Dictionary(doc_collection)

    if doc_threshold > 0:
        dictionary.filter_extremes(no_below = doc_threshold)

    return dictionary

def build_corpus(doc_collection, dictionary):
    corpus = [dictionary.doc2bow(list_of_tokens) for list_of_tokens in doc_collection]
    return corpus

def build_lda_model(corpus, dictionary, num_topics = 20, passes = 4, alpha = 0.01, eta = 0.01):
    model = models.LdaModel(corpus,
                            num_topics = num_topics,
                            id2word = dictionary,
                            passes = passes,
                            alpha = [alpha] * num_topics,
                            eta = [eta] * len(dictionary.keys()))
    return model

def get_topics(model, corpus, num_docs):
    topics = [model[corpus[i]] for i in range(num_docs)]
    return topics

In [11]:
dictionary = build_dictionary(documents)
corpus = build_corpus(documents, dictionary)

# Topic Coherence

In [11]:
def compute_coherence_values(documents, corpus, dictionary, k, a, b, passes):
    model = build_lda_model(corpus, 
                            dictionary, 
                            num_topics = k,
                            passes = passes,
                            alpha = a,
                            eta = b)
    coherence_model_lda = CoherenceModel(model = model, 
                                         texts = documents, 
                                         dictionary = dictionary, 
                                         coherence='c_v')
    return coherence_model_lda.get_coherence()

In [12]:
# Topics range
min_topics = 2
max_topics = 11
topics_range = range(min_topics, max_topics)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))

In [13]:
num_docs = len(corpus)
corpus_sets = [#ClippedCorpus(corpus, num_docs*0.25), 
               #ClippedCorpus(corpus, num_docs*0.5), 
               ClippedCorpus(corpus, int(num_docs*0.75)), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']

In [14]:
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

In [15]:
# LDA model training passes
passes = 5

# iterate through validation corpuses
for i in range(len(corpus_sets)):
    # iterate through number of topics
    print("~"*30)
    print("Starting corpus set {}".format(i))
    print("~"*30)
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterare through beta values
            for b in beta:
                # get the coherence score for the given parameters
                cv = compute_coherence_values(documents, 
                                              corpus = corpus_sets[i], 
                                              dictionary = dictionary, 
                                              k = k, a = a, b = b, passes = passes)
                # Save the model results
                model_results['Validation_Set'].append(corpus_title[i])
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)

pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Starting corpus set 0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Starting corpus set 1
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [16]:
res = pd.read_csv('lda_tuning_results.csv')
res.head()

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,75% Corpus,2,0.01,0.01,0.347371
1,75% Corpus,2,0.01,0.31,0.333952
2,75% Corpus,2,0.01,0.61,0.339187
3,75% Corpus,2,0.01,0.91,0.343954
4,75% Corpus,2,0.31,0.01,0.327972


In [17]:
#res['Validation_Set'] = res['Validation_Set'].map(lambda x: 75 if x == "75% Corpus" else 100)
res_75 = res[res['Validation_Set'] == "75% Corpus"]
res_100 = res[res['Validation_Set'] == "100% Corpus"]

In [18]:
res_100[res_100['Topics'] == 5] 
# k = 4, a = 0.91, b = 0.91 -> 0.537146
# k = 5, a = 0.31, b = 0.91 -> 0.521962

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
192,100% Corpus,5,0.01,0.01,0.393238
193,100% Corpus,5,0.01,0.31,0.386772
194,100% Corpus,5,0.01,0.61,0.327432
195,100% Corpus,5,0.01,0.91,0.341929
196,100% Corpus,5,0.31,0.01,0.3844
197,100% Corpus,5,0.31,0.31,0.3699
198,100% Corpus,5,0.31,0.61,0.350325
199,100% Corpus,5,0.31,0.91,0.389145
200,100% Corpus,5,0.61,0.01,0.365475
201,100% Corpus,5,0.61,0.31,0.4048


In [19]:
c = res_100['Coherence']
info_max_c = res_100[res_100['Coherence'] == max(res_100['Coherence'])]
info_max_c

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
231,100% Corpus,7,0.31,0.91,0.418489


In [20]:
for i in range(0, 70, 10):
    passes = i
    if i == 0:
        passes = 1
    cv = compute_coherence_values(documents, 
                              corpus = corpus, 
                              dictionary = dictionary, 
                              k = 5, a = 0.31, b = 0.91, passes = passes)
    print("Coherence @{}: {}".format(passes, cv))

Coherence @1: 0.3288292541563956
Coherence @10: 0.4331378906418883
Coherence @20: 0.41921415429648157
Coherence @30: 0.44862201194927903
Coherence @40: 0.34454830738806413
Coherence @50: 0.45203522385035233
Coherence @60: 0.5890556895211325


In [12]:
k = 5
passes = 20
a = 0.31
b = 0.91

final_lda_model = build_lda_model(corpus, 
                            dictionary, 
                            num_topics = k,
                            passes = passes,
                            alpha = a,
                            eta = b)

In [13]:
final_lda_model.show_topics(formatted=True, num_topics=k, num_words=10)

[(0,
  '0.011*"brotar" + 0.010*"confirmar" + 0.009*"autoridad" + 0.008*"comer" + 0.006*"provincia" + 0.006*"OMS" + 0.006*"entrar" + 0.006*"millón" + 0.006*"informar" + 0.005*"medir"'),
 (1,
  '0.007*"caer" + 0.006*"dólar" + 0.006*"economía" + 0.006*"mercar" + 0.006*"impactar" + 0.005*"económico" + 0.004*"empresa" + 0.004*"ciento" + 0.004*"compañía" + 0.004*"crecimiento"'),
 (2,
  '0.015*"casar" + 0.015*"Salud" + 0.009*"paciente" + 0.008*"síntoma" + 0.007*"confirmar" + 0.007*"llegar" + 0.007*"respiratorio" + 0.007*"informar" + 0.007*"comer" + 0.006*"enfermedad"'),
 (3,
  '0.023*"vuelo" + 0.010*"aerolínea" + 0.010*"suspender" + 0.009*"febrero" + 0.009*"compañía" + 0.007*"Airways" + 0.006*"Airlines" + 0.006*"cancelar" + 0.006*"deber" + 0.006*"suspensión"'),
 (4,
  '0.014*"comer" + 0.010*"animal" + 0.007*"enfermedad" + 0.006*"humano" + 0.006*"SARS" + 0.006*"infectar" + 0.006*"sobrar" + 0.005*"entrar" + 0.005*"mercar" + 0.005*"vacuno"')]

In [14]:
import sys
sys.path.append('../..')
from core_modules.topic_extraction.lda_module import LdaModule
import pickle5 as pickle

In [15]:
module = LdaModule(trained = False)
module.lang = "es"
module.num_topics = k
module.dictionary = dictionary
module.corpus = corpus
module.model = final_lda_model

In [17]:
with open("tuned_models/lda_model_es.pickle", "wb") as output:
    pickle.dump(module, output, pickle.HIGHEST_PROTOCOL)

In [27]:
final_cv = compute_coherence_values(documents, 
                              corpus = corpus, 
                              dictionary = dictionary, 
                              k = k, a = a, b = b, passes = k)
print(final_cv)

  and should_run_async(code)
0.34992222106472093


In [28]:
import pyLDAvis.gensim
import pickle
import pyLDAvis

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(final_lda_model, corpus, dictionary)

LDAvis_prepared

  and should_run_async(code)
