In [None]:
import pandas as pd
import gensim         #commonly used for NLP processing tasks (like topic modeling)
from gensim import corpora, models
from pprint import pprint

In [None]:
# Load the headlines and keep only the text column.
# The explicit .copy() makes data_text an independent DataFrame, so adding the
# 'index' column below does not raise pandas' SettingWithCopyWarning.
data = pd.read_csv('sample_data/abcnews-date-text.csv')
data_text = data[['headline_text']].copy()
data_text['index'] = data_text.index  # store each row's index label as a regular column
documents = data_text

print("Number of docs: ", len(documents))
print(documents[:5])

Number of docs:  6466
                                        cleaned_text  index
0  enter answer without space crazy trying kind w...      0
1  answer question tried everything nothing seems...      1
2  passed output printed bubble line misleading u...      2
3  problem question try advise bogus variable ans...      3
4  knew bone typo gut feeling submitted without w...      4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_text['index'] = data_text.index


In [None]:
# Tokenize each headline on whitespace: every headline becomes a list of words.
processed_docs = documents['headline_text'].str.split()
# Build the vocabulary, mapping every distinct token to a unique integer ID.
dictionary = gensim.corpora.Dictionary(processed_docs)
# Preview the first 10 (ID, token) entries of the dictionary.
# The previous `count > 10` check let an 11th entry through before breaking.
for count, (k, v) in enumerate(dictionary.items()):
    if count >= 10:
        break
    print(k, v)

NameError: name 'documents' is not defined

In [None]:
# Prune the vocabulary in place before building the corpus.
dictionary.filter_extremes(
    no_below=15,     # drop tokens that appear in fewer than 15 documents
    no_above=0.5,    # drop tokens that appear in more than 50% of the documents
    keep_n=100000,   # afterwards keep at most 100,000 tokens
)

In [None]:
# Convert every tokenized document into its bag-of-words (BoW) form using the dictionary.
# A BoW document is a list of (word_id, frequency) tuples, where frequency is the
# number of times that word occurs in the document.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
bow_corpus[4310]  # display the BoW representation of document 4310

[[(14, 1),
  (16, 1),
  (138, 1),
  (421, 1),
  (1152, 1),
  (1155, 1),
  (2480, 1),
  (2500, 1)],
 [(138, 1), (2766, 1), (2959, 1), (4173, 1)],
 [(54, 1), (633, 1), (4705, 1), (6593, 1)],
 [(643, 1), (1634, 1), (6427, 1), (6594, 1)],
 [(643, 1), (1301, 1), (3720, 1), (6595, 1)]]

In [None]:
# Spell out document 4310's bag-of-words entries in readable form.
bow_doc_4310 = bow_corpus[4310]
for word_id, frequency in bow_doc_4310:
    # word_id is the token's unique ID, dictionary[word_id] the token itself,
    # and frequency how many times it occurs in this document.
    print("Word {} (\"{}\") appears {} time.".format(word_id, dictionary[word_id], frequency))

Word 218 ("govt") appears 1 time.
Word 319 ("group") appears 1 time.
Word 801 ("local") appears 1 time.
Word 1922 ("wants") appears 1 time.
Word 5732 ("compulsory") appears 1 time.
Word 5733 ("ratepayers") appears 1 time.
Word 5734 ("voting") appears 1 time.


In [None]:
# Fit a TF-IDF model on the BoW corpus, then wrap the corpus so that every
# document is re-weighted with TF-IDF scores.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Pretty-print only the first transformed document, showing each token next to its
# TF-IDF score (the transformed document itself holds (word ID, score) tuples).
for doc in corpus_tfidf:
    word_tfidf = [(dictionary[token_id], weight) for token_id, weight in doc]
    pprint(word_tfidf)
    break

[('aba', 0.5170653357672524),
 ('against', 0.2404907204681067),
 ('broadcasting', 0.5010942074514545),
 ('community', 0.28032354288321143),
 ('decides', 0.46172189338729547),
 ('licence', 0.3632407626457737)]


In [None]:
# Train LDA on the raw bag-of-words corpus.
# random_state fixes the model's random initialization so a fresh
# Restart & Run All is as repeatable as multi-worker training allows;
# without it every run starts from a different random state.
# Caveat: with raw counts, very common words (e.g. "the", "is") may dominate the topics.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=42,
)

In [None]:
# List every topic learned by the BoW-trained model (num_topics=-1 requests all of them).
# Each entry pairs a topic ID with a formatted string of that topic's top words and
# their weights. Every printed block is one topic learned from the whole corpus;
# a single document is a mixture of these topics.
for topic_id, topic_terms in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

Topic: 0 
Words: 0.093*"to" + 0.025*"for" + 0.015*"nt" + 0.013*"vaccine" + 0.011*"be" + 0.011*"live" + 0.010*"federal" + 0.010*"national" + 0.010*"scott" + 0.009*"nsw"
Topic: 1 
Words: 0.069*"in" + 0.043*"covid" + 0.036*"after" + 0.028*"19" + 0.018*"at" + 0.014*"sydney" + 0.012*"by" + 0.011*"found" + 0.011*"fire" + 0.011*"crash"
Topic: 2 
Words: 0.058*"coronavirus" + 0.031*"in" + 0.028*"of" + 0.021*"victoria" + 0.017*"for" + 0.017*"nsw" + 0.015*"cases" + 0.013*"coast" + 0.012*"south" + 0.012*"australia"
Topic: 3 
Words: 0.045*"over" + 0.040*"of" + 0.026*"police" + 0.024*"in" + 0.020*"man" + 0.019*"court" + 0.013*"murder" + 0.012*"for" + 0.012*"who" + 0.011*"charged"
Topic: 4 
Words: 0.037*"for" + 0.033*"to" + 0.014*"health" + 0.011*"royal" + 0.010*"quarantine" + 0.009*"calls" + 0.009*"new" + 0.009*"workers" + 0.009*"school" + 0.009*"commission"
Topic: 5 
Words: 0.032*"us" + 0.027*"in" + 0.020*"trump" + 0.020*"with" + 0.018*"by" + 0.017*"what" + 0.015*"china" + 0.015*"are" + 0.013*"of" 

In [None]:
# Train LDA on the TF-IDF weighted corpus, where frequent words are down-weighted
# and rarer words are emphasized.
# random_state fixes the random initialization so reruns are as repeatable as
# multi-worker training allows.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=42,
)

# Score document 4310 against the TF-IDF model.
# The model was trained on TF-IDF vectors, so the query document goes through the
# same TF-IDF transform instead of being passed in as raw counts.
doc_4310_tfidf = tfidf[bow_corpus[4310]]
# Show the document's topics from highest to lowest score.
for index, score in sorted(lda_model_tfidf[doc_4310_tfidf], key=lambda tup: tup[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.4756768047809601	 
Topic: 0.010*"to" + 0.008*"government" + 0.007*"for" + 0.005*"coronavirus" + 0.005*"on" + 0.005*"the" + 0.005*"restrictions" + 0.005*"wednesday" + 0.005*"in" + 0.005*"of"

Score: 0.28774964809417725	 
Topic: 0.009*"to" + 0.007*"health" + 0.007*"the" + 0.007*"for" + 0.006*"of" + 0.005*"mental" + 0.005*"on" + 0.005*"david" + 0.005*"in" + 0.004*"social"

Score: 0.14901325106620789	 
Topic: 0.014*"donald" + 0.012*"in" + 0.009*"after" + 0.008*"crash" + 0.007*"trump" + 0.007*"car" + 0.005*"dies" + 0.005*"on" + 0.005*"at" + 0.005*"man"

Score: 0.012509550899267197	 
Topic: 0.022*"coronavirus" + 0.010*"covid" + 0.010*"cases" + 0.008*"to" + 0.007*"vaccine" + 0.006*"of" + 0.006*"the" + 0.005*"new" + 0.005*"for" + 0.005*"in"

Score: 0.012509046122431755	 
Topic: 0.017*"news" + 0.010*"the" + 0.008*"lockdown" + 0.007*"rural" + 0.007*"markets" + 0.006*"to" + 0.006*"market" + 0.006*"abc" + 0.006*"national" + 0.006*"with"

Score: 0.012508787214756012	 
Topic: 0.009*"why" +