# 1. Import Libraries

In [0]:
import pandas as pd
import numpy as np
import os

#LDA
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary

#Visualizing topics
import pyLDAvis.gensim

#tokenizing
import nltk
# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases
# load it from the 'punkt' package, while newer releases look for 'punkt_tab'
# and raise LookupError when it is missing. Fetch both so the notebook runs
# from a fresh kernel regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
# Load the cleaned Reddit corpus from data/reddit_corpus_cleaned.csv
# and preview its first two rows.
reddit_corpus = pd.read_csv(
    filepath_or_buffer=os.path.join("data", "reddit_corpus_cleaned.csv")
)
reddit_corpus.head(n=2)

Unnamed: 0,title,score,comms_num,time,body,comms,en_titles,en_bodies,posts,word_len_posts,word_len_comms,word_len_en_titles,word_len_en_bodies,word_len_ro_titles,word_len_ro_bodies,posts_prep,posts_prep_lim,word_len_posts_lim
0,protocol de tratament coronavirus RO,0,5,2020-03-17 05:20:49,,Sa vezi cum se golesc farmaciile de preparatel...,the protocol for the treatment of a coronaviru...,,the protocol for the treatment of a coronaviru...,9,109,9,0,5,0,protocol treatment coronavirus ro,protocol treatment coronavirus ro,4
1,Ce soluții credeți ca ar trebui implementate î...,0,1,2020-03-17 08:34:17,,Nu se ocupa nimeni cu asa ceva glovo asta de c...,What do you think it would have to be implemen...,,What do you think it would have to be implemen...,21,43,21,0,15,0,think would implement come period fight corona...,think would implement come period fight corona...,7


In [0]:
# Empty CSV fields are read by pandas as NaN. Casting NaN with astype(str)
# yields the literal text "nan", which would then be tokenized and counted
# as a real word in the vocabulary. Replace missing posts with an empty
# string first so they simply produce an empty token list.
reddit_corpus['posts_prep_lim'] = reddit_corpus['posts_prep_lim'].fillna('').astype(str)
# Split every preprocessed post into a list of word tokens.
reddit_corpus['posts_prep_lim_tok'] = reddit_corpus['posts_prep_lim'].apply(word_tokenize)

# 2. Apply LDA (Latent Dirichlet Allocation) to the corpus

In [0]:
# Build the gensim vocabulary from the tokenized posts,
# then confirm the type of the resulting object.
dictionary_posts = Dictionary(
    documents=list(reddit_corpus['posts_prep_lim_tok'])
)
type(dictionary_posts)

gensim.corpora.dictionary.Dictionary

In [0]:
# Pull the token lists out of the frame and encode each post
# as a bag-of-words vector using the dictionary built above.
corpus = reddit_corpus['posts_prep_lim_tok'].tolist()
final_corpus = list(map(dictionary_posts.doc2bow, corpus))

In [0]:
# Inspect the bag-of-words encoding of the first post:
# a list of integer pairs produced by doc2bow.
final_corpus[0]

[(0, 1), (1, 1), (2, 1), (3, 1)]

In [0]:
# Silence DeprecationWarning messages for the remainder of the session
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

In [0]:
#Implement LDA
%%time
ldamodel = LdaModel(final_corpus, 
                    num_topics = 2,
                    id2word = dictionary_posts, 
                    passes  = 15, 
                    random_state = 5)

CPU times: user 33.1 s, sys: 0 ns, total: 33.1 s
Wall time: 33.1 s


In [0]:
# Summarize every topic by its two most significant words
topics = ldamodel.print_topics(num_words=2)

In [0]:
# Display the two-word summary of each topic
topics

[(0, '0.029*"coronavirus" + 0.015*"romania"'), (1, '0.014*"get" + 0.013*"go"')]

In [0]:
# Repeat the summary with ten words per topic.
# Conclusion: with this many words, the theme of each topic can be made out.
topics_10 = ldamodel.print_topics(num_words=10)
topics_10

[(0,
  '0.029*"coronavirus" + 0.015*"romania" + 0.009*"state" + 0.007*"emergency" + 0.007*"hospital" + 0.007*"usa" + 0.006*"news" + 0.006*"new" + 0.006*"order" + 0.006*"thank"'),
 (1,
  '0.014*"get" + 0.013*"go" + 0.010*"people" + 0.008*"like" + 0.008*"know" + 0.008*"one" + 0.008*"time" + 0.007*"would" + 0.007*"make" + 0.007*"say"')]

In [0]:
# Easier to read: print one (topic id, weighted words) tuple per line
for topic_id, weighted_terms in topics:
    print((topic_id, weighted_terms))

(0, '0.029*"coronavirus" + 0.015*"romania"')
(1, '0.014*"get" + 0.013*"go"')


# 3. Visualization

In [0]:
# Prepare the interactive pyLDAvis view of the trained model.
# sort_topics=False is passed so the topic numbering is not re-sorted.
lda_display = pyLDAvis.gensim.prepare(
    topic_model=ldamodel,
    corpus=final_corpus,
    dictionary=dictionary_posts,
    sort_topics=False,
)
pyLDAvis.display(lda_display)