Running boilerplate LDA models from gensim for topic modeling.
Inspired by: https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

In [1]:
import nltk
import spacy
from nltk.corpus import stopwords
import pandas as pd
import gensim
from gensim.utils import simple_preprocess

# COVID Hate Dataset

In [2]:
# load dataset and tokenize
covid_df = pd.read_csv('./../../data/original_datasets/covid_hate.csv')
covid_df['tokenized'] = covid_df['Text'].apply(lambda x: simple_preprocess(x))
covid_df.head()

Unnamed: 0,Tweet ID,Text,label,tokenized
0,1242553623260868608,Are we still allowed to quote ancient Chinese ...,0,"[are, we, still, allowed, to, quote, ancient, ..."
1,1246508137638580225,@mamacat2u @VBeltiz More power to you! This C...,0,"[mamacat, vbeltiz, more, power, to, you, this,..."
2,1233468243534372865,"CNBC: WHO, Tedros reiterated that the virus co...",0,"[cnbc, who, tedros, reiterated, that, the, vir..."
3,1243626072387747841,"""The heightened racism experienced by Asian co...",1,"[the, heightened, racism, experienced, by, asi..."
4,1225611530978217989,Coronavirus and Nepali in China: KP Oli has di...,0,"[coronavirus, and, nepali, in, china, kp, oli,..."


In [3]:
# combine all the tokenized texts into one list
all_tokenized_text = []
for i in range(len(covid_df)):
    all_tokenized_text.extend(covid_df['tokenized'][i])
print(len(all_tokenized_text))

59841


In [4]:
# build bigram and trigram models
bigram = gensim.models.Phrases(all_tokenized_text, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[all_tokenized_text], threshold=100)
bigram_maker = gensim.models.phrases.Phraser(bigram)
trigram_maker = gensim.models.phrases.Phraser(trigram)

In [5]:
# remove stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
all_tokenized_text = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in all_tokenized_text]

In [6]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [7]:
# form bigrams
all_tokenized_text_bi = [bigram_maker[doc] for doc in all_tokenized_text]
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
all_tokenized_text_bi = lemmatization(all_tokenized_text_bi)
# drop all the empty lists
all_tokenized_text_bi = [doc for doc in all_tokenized_text_bi if len(doc) > 0]
print(all_tokenized_text_bi[:20])

[['still'], ['allow'], ['quote'], ['ancient'], ['chinese'], ['proverb'], ['racist'], ['racismisavirus'], ['mamacat'], ['power'], ['chinese'], ['virus'], ['thing'], ['really'], ['show'], ['crazy'], ['low'], ['iq'], ['people'], ['go']]


In [8]:
dictionary = gensim.corpora.Dictionary(all_tokenized_text_bi)
corpus = [dictionary.doc2bow(doc) for doc in all_tokenized_text_bi]

In [9]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, random_state=100, update_every=1, chunksize=100, passes=10, per_word_topics=True)

In [10]:
from pprint import pprint
# Print the Keyword in the 10 topics
pprint(lda.print_topics(num_topics=10, num_words=10))

[(0,
  '0.192*"fuck" + 0.114*"racism" + 0.070*"many" + 0.055*"back" + '
  '0.023*"really" + 0.014*"right" + 0.000*"world" + 0.000*"say" + 0.000*"kill" '
  '+ 0.000*"ccp"'),
 (1,
  '0.458*"coronavirus" + 0.086*"go" + 0.045*"iamnotavirus" + 0.023*"keep" + '
  '0.015*"asianamerican" + 0.004*"bring" + 0.000*"world" + 0.000*"say" + '
  '0.000*"kill" + 0.000*"ccp"'),
 (2,
  '0.273*"call" + 0.033*"community" + 0.030*"patient" + 0.025*"line" + '
  '0.000*"world" + 0.000*"say" + 0.000*"kill" + 0.000*"ccp" + 0.000*"fucking" '
  '+ 0.000*"trump"'),
 (3,
  '0.508*"chinese" + 0.041*"show" + 0.036*"year" + 0.000*"world" + 0.000*"say" '
  '+ 0.000*"kill" + 0.000*"ccp" + 0.000*"fucking" + 0.000*"trump" + '
  '0.000*"come"'),
 (4,
  '0.340*"covid" + 0.184*"get" + 0.004*"turn" + 0.000*"world" + 0.000*"say" + '
  '0.000*"kill" + 0.000*"ccp" + 0.000*"fucking" + 0.000*"come" + '
  '0.000*"trump"'),
 (5,
  '0.351*"virus" + 0.204*"asian" + 0.146*"country" + 0.000*"world" + '
  '0.000*"say" + 0.000*"kill" + 0

In [11]:
from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda, texts=all_tokenized_text_bi, dictionary=dictionary, coherence='u_mass')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score:', coherence_lda)

Coherence Score: -22.105412822786864


UMass Measure: https://mimno.infosci.cornell.edu/papers/mimno-semantic-emnlp.pdf

In [14]:
import pickle 
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = gensimvis.prepare(lda, corpus, dictionary)
LDAvis_prepared


  default_term_info = default_term_info.sort_values(
