In [1]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import pandas as pd 
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emreb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
stemmer = SnowballStemmer("english")

In [3]:

data = pd.read_csv('C:/Users/emreb/Documents/projects/newonelast/redditdata//abcnews-date-text.csv', on_bad_lines="skip")
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [4]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result

In [5]:
len(data)

1244184

In [6]:
doc_sample = documents[documents['index'] == 40310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['actor', 'launches', 'coffs', 'green', 'plan']


 tokenized and lemmatized document: 
['actor', 'launch', 'coff', 'green', 'plan']


In [7]:
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:10]

0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

In [8]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


In [9]:
dictionary.filter_extremes(no_below=5, no_above=0.1, keep_n= 100000)

In [10]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[5200]

[(166, 1), (244, 1), (296, 1), (597, 1), (856, 1), (3738, 1), (3739, 1)]

In [11]:
bow_doc_5200 = bow_corpus[5200]
for i in range(len(bow_doc_5200)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_5200[i][0], 
                                               dictionary[bow_doc_5200[i][0]], 
bow_doc_5200[i][1]))

Word 166 ("govt") appears 1 time.
Word 244 ("group") appears 1 time.
Word 296 ("vote") appears 1 time.
Word 597 ("local") appears 1 time.
Word 856 ("want") appears 1 time.
Word 3738 ("compulsori") appears 1 time.
Word 3739 ("ratepay") appears 1 time.


In [12]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5844216176085719),
 (1, 0.38716866963787633),
 (2, 0.5013820927104505),
 (3, 0.5071171375845095)]


In [13]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary,  workers=2,passes=2)

In [14]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.019*"australian" + 0.018*"perth" + 0.017*"miss" + 0.017*"arrest" + 0.013*"tasmanian" + 0.013*"quarantin" + 0.013*"interview" + 0.012*"bank" + 0.011*"hous" + 0.011*"search"
Topic: 1 
Words: 0.024*"news" + 0.019*"market" + 0.018*"canberra" + 0.014*"chang" + 0.013*"australian" + 0.013*"royal" + 0.013*"care" + 0.011*"age" + 0.011*"fall" + 0.011*"climat"
Topic: 2 
Words: 0.044*"queensland" + 0.024*"coronavirus" + 0.021*"coast" + 0.020*"state" + 0.018*"tasmania" + 0.017*"home" + 0.015*"scott" + 0.013*"gold" + 0.012*"andrew" + 0.012*"power"
Topic: 3 
Words: 0.041*"victoria" + 0.036*"elect" + 0.036*"sydney" + 0.023*"crash" + 0.021*"die" + 0.018*"lockdown" + 0.016*"morrison" + 0.012*"guilti" + 0.011*"biden" + 0.011*"road"
Topic: 4 
Words: 0.059*"australia" + 0.054*"covid" + 0.033*"trump" + 0.019*"vaccin" + 0.018*"donald" + 0.018*"test" + 0.017*"world" + 0.016*"record" + 0.016*"coronavirus" + 0.014*"south"
Topic: 5 
Words: 0.021*"health" + 0.020*"nation" + 0.018*"school" + 0.0

Running LDA using TF-IDF

In [15]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.010*"lockdown" + 0.010*"australia" + 0.009*"final" + 0.009*"world" + 0.008*"scott" + 0.006*"open" + 0.006*"beat" + 0.006*"turnbul" + 0.006*"australian" + 0.006*"juli"
Topic: 1 Word: 0.020*"trump" + 0.011*"govern" + 0.010*"countri" + 0.007*"coronavirus" + 0.007*"chang" + 0.007*"hour" + 0.007*"health" + 0.006*"fund" + 0.006*"climat" + 0.006*"say"
Topic: 2 Word: 0.010*"guilti" + 0.009*"jail" + 0.009*"sentenc" + 0.008*"assault" + 0.008*"sexual" + 0.008*"pandem" + 0.007*"david" + 0.007*"quarantin" + 0.007*"mother" + 0.006*"plead"
Topic: 3 Word: 0.013*"restrict" + 0.011*"royal" + 0.010*"coronavirus" + 0.010*"andrew" + 0.009*"commiss" + 0.009*"friday" + 0.008*"age" + 0.008*"thursday" + 0.008*"care" + 0.008*"michael"
Topic: 4 Word: 0.015*"crash" + 0.011*"miss" + 0.010*"polic" + 0.010*"woman" + 0.009*"dead" + 0.009*"search" + 0.008*"death" + 0.008*"kill" + 0.008*"die" + 0.007*"mental"
Topic: 5 Word: 0.010*"live" + 0.009*"coronavirus" + 0.008*"price" + 0.008*"monday" + 0.008*"co

Performance evaluation by classifying sample

In [18]:
for index, score in sorted(lda_model[bow_corpus[5200]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8874775767326355	 
Topic: 0.031*"govern" + 0.022*"live" + 0.018*"call" + 0.016*"restrict" + 0.014*"island" + 0.013*"water" + 0.013*"return" + 0.012*"alleg" + 0.012*"say" + 0.011*"plan"

Score: 0.012506099417805672	 
Topic: 0.021*"health" + 0.020*"nation" + 0.018*"school" + 0.015*"minist" + 0.015*"busi" + 0.014*"indigen" + 0.013*"communiti" + 0.012*"fund" + 0.011*"hospit" + 0.011*"work"

Score: 0.01250217854976654	 
Topic: 0.028*"face" + 0.025*"peopl" + 0.019*"protest" + 0.016*"street" + 0.013*"citi" + 0.011*"mental" + 0.011*"michael" + 0.011*"turn" + 0.010*"trial" + 0.010*"sport"

Score: 0.012502127327024937	 
Topic: 0.040*"polic" + 0.032*"case" + 0.025*"death" + 0.021*"charg" + 0.021*"year" + 0.020*"court" + 0.019*"murder" + 0.017*"bushfir" + 0.016*"help" + 0.013*"famili"

Score: 0.01250199694186449	 
Topic: 0.019*"australian" + 0.018*"perth" + 0.017*"miss" + 0.017*"arrest" + 0.013*"tasmanian" + 0.013*"quarantin" + 0.013*"interview" + 0.012*"bank" + 0.011*"hous" + 0.011*"sea

Performance evaluation by classifying sample document using LDA TF-IDF 

In [19]:
for index, score in sorted(lda_model_tfidf[bow_corpus[5200]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.5084957480430603	 
Topic: 0.020*"trump" + 0.011*"govern" + 0.010*"countri" + 0.007*"coronavirus" + 0.007*"chang" + 0.007*"hour" + 0.007*"health" + 0.006*"fund" + 0.006*"climat" + 0.006*"say"

Score: 0.2068500518798828	 
Topic: 0.026*"news" + 0.015*"rural" + 0.010*"stori" + 0.009*"nation" + 0.009*"sport" + 0.008*"busi" + 0.007*"social" + 0.007*"speak" + 0.007*"insid" + 0.006*"august"

Score: 0.19710545241832733	 
Topic: 0.010*"live" + 0.009*"coronavirus" + 0.008*"price" + 0.008*"monday" + 0.008*"covid" + 0.007*"market" + 0.007*"australian" + 0.007*"record" + 0.006*"cattl" + 0.006*"rain"

Score: 0.012508881278336048	 
Topic: 0.010*"coronavirus" + 0.009*"covid" + 0.009*"elect" + 0.008*"australia" + 0.008*"south" + 0.007*"updat" + 0.006*"wednesday" + 0.006*"violenc" + 0.006*"north" + 0.006*"mark"

Score: 0.012507136911153793	 
Topic: 0.022*"donald" + 0.016*"drum" + 0.010*"weather" + 0.010*"tuesday" + 0.009*"wall" + 0.008*"alan" + 0.008*"christma" + 0.007*"street" + 0.007*"celebr"

### Testing Section

In [20]:
unseen_document= '''I met a traveller from an antique land,
                    Who said—“Two vast and trunkless legs of stone
                    Stand in the desert. . . . Near them, on the sand,
                    Half sunk a shattered visage lies, whose frown,
                    And wrinkled lip, and sneer of cold command,
                    Tell that its sculptor well those passions read
                    Which yet survive, stamped on these lifeless things,
                    The hand that mocked them, and the heart that fed;
                    And on the pedestal, these words appear:
                    My name is Ozymandias, King of Kings;'''

print(unseen_document)
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

I met a traveller from an antique land,
                    Who said—“Two vast and trunkless legs of stone
                    Stand in the desert. . . . Near them, on the sand,
                    Half sunk a shattered visage lies, whose frown,
                    And wrinkled lip, and sneer of cold command,
                    Tell that its sculptor well those passions read
                    Which yet survive, stamped on these lifeless things,
                    The hand that mocked them, and the heart that fed;
                    And on the pedestal, these words appear:
                    My name is Ozymandias, King of Kings;
Score: 0.1953313648700714	 Topic: 0.023*"attack" + 0.023*"china" + 0.022*"warn" + 0.019*"north" + 0.018*"kill"
Score: 0.14337611198425293	 Topic: 0.024*"news" + 0.019*"market" + 0.018*"canberra" + 0.014*"chang" + 0.013*"australian"
Score: 0.14110662043094635	 Topic: 0.059*"australia" + 0.054*"covid" + 0.033*"trump" + 0.019*"vaccin" + 0.018*"donald"
Score: 

In [21]:
from gensim.models import CoherenceModel

In [22]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
# coherence_model_lda = CoherenceModel(model= lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
# coherence_lda = coherence_model_lda.get_coherence()
# print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.126624749097992


In [23]:
# import zipfile
import pyLDAvis
import pyLDAvis.gensim 

  from imp import reload


In [26]:
pyLDAvis.enable_notebook()
visal = pyLDAvis.gensim.prepare(lda_model,  bow_corpus,dictionary)
visal
# Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.
# A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.

  default_term_info = default_term_info.sort_values(


In [None]:
#  results shows consistency that's nice

 TFI-IDF Table

In [29]:
visaltf_idf = pyLDAvis.gensim.prepare(lda_model_tfidf,  corpus_tfidf,dictionary)
visaltf_idf

  default_term_info = default_term_info.sort_values(


## Salient Topics

In [30]:
sal_lda_model = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,
                                           id2word=dictionary,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
print('\nPerplexity: ', sal_lda_model.log_perplexity(bow_corpus)) 

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(sal_lda_model, bow_corpus, dictionary)
vis

In [None]:
# Print the Keyword in the 10 topics
# pprint(lda_model.print_topics())
# doc_lda = lda_model[dictionary]

In [None]:
import zipfile
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip

In [None]:
with zipfile.ZipFile('/mallet-2.0.8.zip', 'r') as zip_ref:#the directory of zipped file
    zip_ref.extractall('C:/Users/emreb/Documents/projects/newonelast') #directory of unzipped file

In [None]:
import os

os.environ.update({
        'MALLET_HOME':
        r"/content/drive/MyDrive/dataAI/mallet/mallet-2.0.8/bin/mallet",
    })

In [None]:
mallet_path = '/content/drive/MyDrive/dataAI/mallet/mallet-2.0.8/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)