In [54]:
import re, time
from collections import OrderedDict

# Visualisation / Data handling
import pandas as pd
import seaborn as sns

# Cleaning and Preprocessing
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# Importing Gensim
import gensim
from gensim import corpora

# database 
from db_mongo import connect, get_db

connect('mongodb://server-grapefruit.quving.com:27027/textminer')
db_cursor = get_db().noz_en.find()

NUM_DOCUMENTS = 500
NUM_TOPICS = 20
TESTSET_SIZE = 2

def get_texts(db_cursor, num_documents):
    actual_docs_count = 0
    content_dict0 = OrderedDict()
    for article in db_cursor.limit(num_documents):
        actual_docs_count += 1
        pattern = re.compile("-\d+")
        key = re.sub(pattern, '', article.get('slug'))
        content = article.get('content_en')
        if content:
            content_dict0[key] = content

    print(f"Using {len(content_dict0)} unique documents out of {actual_docs_count} total retrieved. ({num_documents} requested)")

    # for headline, content in content_dict0.items():
    #     print('KEY: '+str(headline))
    #     print('CONTENT: '+str(content))
    #     print()
    
    return content_dict0

In [55]:
def prepare_texts(content_dict0):
    doc_complete = [str(doc) for doc in content_dict0.values()]
    # doc_complete

    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation) 
    # exclude
    lemma = WordNetLemmatizer()
    def clean(doc):
        stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
        return normalized

    doc_clean = [clean(doc).split() for doc in doc_complete]  
    # doc_clean
    # len(doc_clean)

    # Preparing Document-Term Matrix

    # Creating the term dictionary of our courpus, where every unique term is assigned an index. 
    dictionary = corpora.Dictionary(doc_clean)
    # dictionary

    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # doc_term_matrix

    return doc_complete, doc_clean, dictionary, doc_term_matrix

In [56]:
content_dict = get_texts(db_cursor, num_documents=505)
keys_train = list(content_dict.keys())[:-TESTSET_SIZE]
keys_test = list(content_dict.keys())[-TESTSET_SIZE:]
print(f"{len(keys_train)} docs for training and {len(keys_test)} docs for testing.")

Using 239 unique documents out of 505 total retrieved. (505 requested)
237 docs for training and 2 docs for testing.


In [57]:
list(content_dict.keys())[:3]

['preise-fuer-kinderbefoerderung-in-der-gemeinde-berge-steigen',
 'keine-wohnung-fuer-frau-mit-zigeunereinschlag',
 'mit-dino-und-star-wars-so-argumentiert-ein-us-senator-gegen-klimaschutz']

In [58]:
content_dict_train = {k:v for k,v in content_dict.items() if k in keys_train}
doc_complete, doc_clean, dictionary, doc_term_matrix = prepare_texts(content_dict_train)
len(content_dict_train)

237

In [59]:
content_dict_test = {k:v for k,v in content_dict.items() if k in keys_test}
doc_complete2, doc_clean2, dictionary2, doc_term_matrix2 = prepare_texts(content_dict_test)
# content_dict_test
len(content_dict_test)

2

In [61]:
set(content_dict_train.keys()) & set(content_dict_test.keys())  # should be empty, so no overlap

set()

In [52]:
import time
NUM_TOPICS = 20

In [64]:
# Running LDA Model
start_time = time.time()
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=NUM_TOPICS, id2word = dictionary, passes=50)
elapsed_time = time.time() - start_time
print(f"Training the LDA model with {len(doc_term_matrix)} documents took {elapsed_time} seconds.")

Training the LDA model took 19.849446058273315 seconds.


In [67]:
topics = ldamodel.get_topics()

# Results

# print(ldamodel.print_topics(num_topics=5, num_words=3))
# ldamodel.print_topics(num_topics=-1, num_words=400)

for i in range(min(ldamodel.num_topics,3)):
    print(ldamodel.print_topic(i))
#     print()

0.006*"care" + 0.006*"also" + 0.005*"bauhaus" + 0.005*"people" + 0.004*"would" + 0.004*"april" + 0.004*"like" + 0.004*"example" + 0.004*"always" + 0.003*"district"
0.009*"house" + 0.007*"minister" + 0.007*"vote" + 0.006*"may" + 0.006*"district" + 0.005*"new" + 0.005*"prime" + 0.005*"party" + 0.005*"percent" + 0.005*"however"
0.012*"euro" + 0.012*"million" + 0.006*"year" + 0.006*"2018" + 0.005*"also" + 0.003*"müstak" + 0.003*"club" + 0.003*"said" + 0.003*"two" + 0.003*"around"


In [69]:
# dictionary.id2token

In [71]:
import pandas as pd
import seaborn as sns

In [72]:
df_topics = pd.DataFrame(ldamodel.get_topics())
df_topics.rename(columns=dictionary.id2token, inplace=True)
# df_topics['police']
df_topics

Unnamed: 0,046,084,12450,13,14,15,18,20,20132014,2017,...,renton,restore,seattle,sinnett,specific,strict,technician,traditionally,urgently,vain
0,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,0.000656,...,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05
1,2.4e-05,2.4e-05,2.4e-05,2.4e-05,2.4e-05,0.000498,0.000498,2.4e-05,2.4e-05,2.4e-05,...,2.4e-05,2.4e-05,2.4e-05,2.4e-05,2.4e-05,2.4e-05,2.4e-05,2.4e-05,2.4e-05,2.4e-05
2,1.6e-05,1.6e-05,1.6e-05,1.6e-05,0.001319,1.6e-05,0.000689,1.6e-05,1.6e-05,0.001319,...,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05
3,2.7e-05,2.7e-05,2.7e-05,2.7e-05,0.001104,0.000566,2.7e-05,0.000565,2.7e-05,2.7e-05,...,2.7e-05,2.7e-05,2.7e-05,2.7e-05,2.7e-05,2.7e-05,2.7e-05,2.7e-05,2.7e-05,2.7e-05
4,3e-05,3e-05,3e-05,0.001214,0.002398,0.001214,3e-05,3e-05,3e-05,0.000622,...,3e-05,3e-05,3e-05,3e-05,3e-05,3e-05,3e-05,3e-05,3e-05,3e-05
5,3.3e-05,3.3e-05,3.3e-05,3.3e-05,0.000701,3.3e-05,3.3e-05,0.000701,3.3e-05,0.001362,...,3.3e-05,3.3e-05,3.3e-05,3.3e-05,3.3e-05,3.3e-05,3.3e-05,3.3e-05,3.3e-05,3.3e-05
6,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,0.000459,2.2e-05,2.2e-05,0.000509,...,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05
7,0.000172,0.000172,0.000172,0.000335,0.000662,0.00088,0.000335,0.00081,0.000172,0.000502,...,8e-06,8e-06,8e-06,8e-06,8e-06,8e-06,8e-06,8e-06,8e-06,8e-06
8,1.9e-05,1.9e-05,1.9e-05,0.001156,1.9e-05,0.000411,0.000777,1.9e-05,1.9e-05,1.9e-05,...,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05
9,1.9e-05,1.9e-05,1.9e-05,1.9e-05,0.000411,0.000395,1.9e-05,0.000721,0.000394,1.9e-05,...,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05


In [73]:
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
pyLDAvis.enable_notebook()

In [50]:
start_time = time.time()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus=doc_term_matrix, dictionary=dictionary)
vis
elapsed_time = time.time() - start_time
print(f"Visualising the LDA model with {len(doc_term_matrix)} documents took {elapsed_time} seconds.")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [20]:
pyLDAvis

<module 'pyLDAvis' from '/Users/simon/.py_envs/news_find_me/lib/python3.6/site-packages/pyLDAvis/__init__.py'>

In [74]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [75]:
len(doc_clean)

237

In [76]:
# doc_clean[0]

In [77]:
# doc_complete[0]

In [78]:
# len(doc_term_matrix)

In [79]:
# doc_term_matrix

In [80]:
# whos

In [81]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamodel, corpus=doc_term_matrix, texts=doc_clean)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(30)

[(7, 0.9951012)]
[(17, 0.99610513)]
[(3, 0.047277804), (6, 0.052349437), (12, 0.021604009), (14, 0.066958934), (15, 0.80907)]
[(4, 0.48449624), (12, 0.4918124)]
[(2, 0.99469125)]
[(8, 0.98733044)]
[(7, 0.29563367), (13, 0.60224235), (14, 0.038058333), (17, 0.060986754)]
[(2, 0.9971203)]
[(6, 0.9956809)]
[(2, 0.97204936)]
[(16, 0.9925528)]
[(0, 0.18556845), (8, 0.80617034)]
[(15, 0.99619913)]
[(12, 0.9946317)]
[(0, 0.02607599), (7, 0.61561084), (16, 0.3502993)]
[(16, 0.9963728)]
[(19, 0.9888202)]
[(4, 0.98549116), (15, 0.012095315)]
[(4, 0.9965322)]
[(19, 0.99420595)]
[(13, 0.9925759)]
[(18, 0.9969447)]
[(14, 0.9971884)]
[(2, 0.99692434)]
[(7, 0.119793914), (13, 0.016825262), (15, 0.06739131), (18, 0.79253983)]
[(8, 0.9985732)]
[(12, 0.9902039)]
[(10, 0.994507)]
[(10, 0.9784014)]
[(9, 0.99722946)]
[(7, 0.97933674)]
[(11, 0.9904012)]
[(0, 0.99402314)]
[(18, 0.97431594)]
[(11, 0.04253647), (15, 0.9526998)]
[(10, 0.99263203)]
[(5, 0.997411)]
[(16, 0.8416019)]
[(7, 0.99009657)]
[(0, 0.99040

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,7.0,0.9951,"year, also, work, school, euro, first, distric...","[bus, transport, community, organizes, boy, gi..."
1,1,17.0,0.9961,"year, also, time, deer, euro, team, first, fal...","[note, prospectus, sheet, clear, quot1, pers, ..."
2,2,15.0,0.8091,"year, accident, number, injured, child, new, p...","[ever, since, donald, trump, became, president..."
3,3,12.0,0.4918,"fire, easter, police, also, god, flensburg, ta...","[47yearold, stopped, hitzacker, cat, wanted, c..."
4,4,2.0,0.9947,"euro, million, year, 2018, also, müstak, club,...","[profit, deutsche, bahn, declined, significant..."
5,5,8.0,0.9873,"pirate, party, deckarm, also, handball, year, ...","[time, weekend, springlike, temperature, retur..."
6,6,13.0,0.6022,"also, election, city, said, sophia, year, hagi...","[city, sleepy, loveable, dirty, overcrowded, a..."
7,7,2.0,0.9971,"euro, million, year, 2018, also, müstak, club,...","[osnabrück, closed, past, year, record, result..."
8,8,6.0,0.9957,"time, said, ship, migrant, according, people, ...","[dangerous, find, ratzeburg, lake, near, utech..."
9,9,2.0,0.972,"euro, million, year, 2018, also, müstak, club,...","[oil, price, responded, tweet, discount, first..."


In [67]:
df_dominant_topic.shape

(238, 5)