In [54]:
import spacy
import json
import ast
import numpy as np
import pandas as pd

from gensim.utils import ClippedCorpus
from gensim.models import Phrases, CoherenceModel
from gensim import corpora, models

from pymongo import MongoClient
from itertools import chain
from datetime import datetime

  and should_run_async(code)


# Connection to mongo

In [56]:
mongourl = "mongodb://admin:adminpassword@localhost:27017"
MONGO_CLIENT = MongoClient(mongourl)

START_YEAR = 2020
START_MONTH = 2
END_YEAR = 2020
END_MONTH = 3
START = datetime(START_YEAR, START_MONTH, 1, 0, 0)
END = datetime(END_YEAR, END_MONTH, 1, 0, 0)

  and should_run_async(code)


In [57]:
def db_news_extraction(lang, query, limit=10):
    if lang != "it":
        name_coll = "article_" + lang
    else:
        name_coll = "article"
    collection = MONGO_CLIENT["news"][name_coll]

    not_processed_docs = collection.find(query).limit(5000)
    return collection, not_processed_docs

  and should_run_async(code)


In [58]:
def build_query():
    '''
    q = {
        "$and": [
            {"discoverDate": {"$gte": START, "$lt": END}},
            {"bertEncoding": {"$exists": True}},
            {"$where": "this.bertEncoding.length > 0"},
        ]
    }
    '''
    q = {"discoverDate": {"$gte": START, "$lt": END}}
    return q

  and should_run_async(code)


In [59]:
def update_dates(self):
    START_MONTH += 1
    END_MONTH += 1
    if START_MONTH == 13:
        START_MONTH = 1
        START_YEAR += 1
    if END_MONTH == 13:
        END_MONTH = 1
        END_YEAR += 1
    START = datetime(START_YEAR, START_MONTH, 1, 0, 0)
    END = datetime(END_YEAR, END_MONTH, 1, 0, 0)

  and should_run_async(code)


# Pre-processing documents

In [60]:
nlp = spacy.load("it_core_news_md")

  and should_run_async(code)


In [None]:
'''for word in nlp.Defaults.stop_words:
        if nlp.vocab[word].is_stop:
            print(word)'''

In [61]:
def fix_stop_words():
    for word in nlp.Defaults.stop_words:
        nlp.vocab[word].is_stop = True
    return

def add_custom_stop_words(custom_stop_words):
    for cw in custom_stop_words:
        nlp.vocab[cw].is_stop = True
    return

def sentence_tokenize(data):
    return [sent for sent in data.sents]

def lemmatize_tokens(data):
    lemmas = []
    for sent in data:
        sent_tokens = []
        for token in sent:
            candidate = token.lemma_.replace("’", "")
            if (not nlp.vocab[candidate].is_stop and not token.is_punct and
                len(candidate) > 1 and not candidate.isspace()):
                sent_tokens.append(candidate)
        lemmas.append(sent_tokens)
        sent_tokens = []
    return lemmas

def flatten_list(data):
    return list(chain.from_iterable(data))

fix_stop_words()

  and should_run_async(code)


In [62]:
def parse_text(raw_data):
    doc = nlp(raw_data)
    # Retrieve sentences
    sentences = sentence_tokenize(doc)
    # print(len(sentences))
    # Lemmatize + remove stop words
    lemmas = lemmatize_tokens(sentences)
    # print(len(lemmas))
    # Flatten results into a single list
    parsed_text = flatten_list(lemmas)

    return parsed_text

  and should_run_async(code)


Create list with parsed documents

In [63]:
documents = []
query = build_query()
_, not_processed_docs = db_news_extraction("it", query)
for doc in not_processed_docs:
    parsed_doc = parse_text(doc["text"])
    documents.append(parsed_doc)

  and should_run_async(code)


# LDA utility functions

In [64]:
def get_word_collocations(tokens):
    bigrams = Phrases(tokens)
    trigrams = Phrases(bigrams[tokens], min_count=1)
    return list(trigrams[bigrams[tokens]])

def string_to_list(tokens):
    return ast.literal_eval(tokens)

def save_lda_model(ldaModule, location):
    with open(location + ".pickle", "wb") as output:
        pickle.dump(ldaModule, output, pickle.HIGHEST_PROTOCOL)

def load_lda_model(location):
    with open(location + ".pickle", "rb") as input_file:
        ldaModule = pickle.load(input_file)
    return ldaModule

  and should_run_async(code)


# LDA Module implementation

In [65]:
def build_dictionary(doc_collection, use_collocations=True, doc_threshold=3):
    if use_collocations:
        doc_collection = get_word_collocations(doc_collection)
    else:
        doc_collection = [string_to_list(t) for t in doc_collection]

    dictionary = corpora.Dictionary(doc_collection)

    if doc_threshold > 0:
        dictionary.filter_extremes(no_below = doc_threshold)

    return dictionary

def build_corpus(doc_collection, dictionary):
    corpus = [dictionary.doc2bow(list_of_tokens) for list_of_tokens in doc_collection]
    return corpus

def build_lda_model(corpus, dictionary, num_topics = 20, passes = 4, alpha = 0.01, eta = 0.01):
    model = models.LdaModel(corpus,
                            num_topics = num_topics,
                            id2word = dictionary,
                            passes = passes,
                            alpha = [alpha] * num_topics,
                            eta = [eta] * len(dictionary.keys()))
    return model

def get_topics(model, corpus, num_docs):
    topics = [model[corpus[i]] for i in range(num_docs)]
    return topics

  and should_run_async(code)


In [66]:
dictionary = build_dictionary(documents)
corpus = build_corpus(documents, dictionary)

  and should_run_async(code)


# Topic Coherence

In [None]:
def compute_coherence_values(documents, corpus, dictionary, k, a, b, passes):
    model = build_lda_model(corpus, 
                            dictionary, 
                            num_topics = k,
                            passes = passes,
                            alpha = a,
                            eta = b)
    coherence_model_lda = CoherenceModel(model = model, 
                                         texts = documents, 
                                         dictionary = dictionary, 
                                         coherence='c_v')
    return coherence_model_lda.get_coherence()

In [None]:
# Topics range
min_topics = 2
max_topics = 6
topics_range = range(min_topics, max_topics)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))

In [None]:
num_docs = len(corpus)
corpus_sets = [#ClippedCorpus(corpus, num_docs*0.25), 
               #ClippedCorpus(corpus, num_docs*0.5), 
               ClippedCorpus(corpus, int(num_docs*0.75)), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']

In [None]:
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': [],
                }

In [None]:
# LDA model training passes
passes = 5

# iterate through validation corpuses
for i in range(len(corpus_sets)):
    # iterate through number of topics
    print("~"*30)
    print("Starting corpus set {}".format(i))
    print("~"*30)
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterare through beta values
            for b in beta:
                # get the coherence score for the given parameters
                cv = compute_coherence_values(documents, 
                                              corpus = corpus_sets[i], 
                                              dictionary = dictionary, 
                                              k = k, a = a, b = b, passes = passes)
                # Save the model results
                model_results['Validation_Set'].append(corpus_title[i])
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)

pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

In [None]:
res = pd.read_csv('lda_tuning_results.csv')
res.head()

In [None]:
#res['Validation_Set'] = res['Validation_Set'].map(lambda x: 75 if x == "75% Corpus" else 100)
res_75 = res[res['Validation_Set'] == "75% Corpus"]
res_100 = res[res['Validation_Set'] == "100% Corpus"]

In [None]:
res_100[res_100['Topics'] == 5]

In [None]:
info_max_c = res_100[res_100['Coherence'] == max(res_100['Coherence'])]
info_max_c

In [None]:
res_100[res_100['Topics'] == 3]

In [67]:
'''k = 4
a = 0.31
b = 0.91'''

k = 3
a = 0.01
b = 0.91

max_cv = 0.0
final_passes = 0

for i in range(0, 50, 10):
    passes = i
    if i == 0:
        passes = 1
    cv = compute_coherence_values(documents, 
                              corpus = corpus, 
                              dictionary = dictionary, 
                              k = k, a = a, b = b, passes = passes)
    if cv > max_cv:
        max_cv = cv
        final_passes = passes
    print("Coherence @{}: {}".format(passes, cv))
print("="*10)
print("{} lead to {} score".format(final_passes, max_cv))

  and should_run_async(code)
Coherence @1: 0.3109742861676696
Coherence @10: 0.35294531179744054
Coherence @20: 0.35021411831454774
Coherence @30: 0.5002789539205879
Coherence @40: 0.40725822238088666
30 lead to 0.5002789539205879 score


In [68]:
final_lda_model = build_lda_model(corpus, 
                            dictionary, 
                            num_topics = k,
                            passes = final_passes,
                            alpha = a,
                            eta = b)

  and should_run_async(code)


In [69]:
final_lda_model.show_topics(formatted=True, num_topics=k, num_words=10)

  and should_run_async(code)


[(0,
  '0.009*"dio" + 0.008*"sanitario" + 0.008*"italiano" + 0.006*"cinese" + 0.006*"Coronavirus" + 0.006*"Italia" + 0.005*"emergenza" + 0.005*"Salute" + 0.005*"Roma" + 0.004*"rientrare"'),
 (1,
  '0.013*"virus" + 0.011*"persona" + 0.010*"potere" + 0.009*"dio" + 0.008*"cinese" + 0.008*"contagiare" + 0.007*"epidemia" + 0.007*"paziente" + 0.006*"Wuhan" + 0.006*"test"'),
 (2,
  '0.012*"cinese" + 0.009*"dio" + 0.009*"potere" + 0.004*"epidemia" + 0.004*"dovere" + 0.004*"partire" + 0.003*"Italia" + 0.003*"Coronavirus" + 0.003*"solere" + 0.003*"virus"')]

In [70]:
cv = CoherenceModel(model = final_lda_model,
            texts = documents, 
            dictionary = dictionary, 
            coherence='c_v')
print(cv.get_coherence())

  and should_run_async(code)
0.3584134965757675


In [71]:
import pyLDAvis.gensim
import pickle
import pyLDAvis

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(final_lda_model, corpus, dictionary)

LDAvis_prepared

  and should_run_async(code)


In [72]:
pyLDAvis.save_html(LDAvis_prepared, 'lda_march.html')

  and should_run_async(code)


# Saving LDA model for pipeline

In [None]:
import sys
sys.path.append('../..')
from core_modules.topic_extraction.lda_module import LdaModule
import pickle5 as pickle

In [None]:
module = LdaModule(trained = False)
module.lang = "it"
module.num_topics = k
module.dictionary = dictionary
module.corpus = corpus
module.model = final_lda_model

In [None]:
with open("tuned_models/lda_model_it_{}_{}.pickle".format(START.strftime("%Y/%m"), END.strftime("%Y/%m")), "wb") as output:
    pickle.dump(module, output, pickle.HIGHEST_PROTOCOL)