Running boilerplate LDA models from gensim for topic modeling.
Inspired by: https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

In [18]:
import nltk
import spacy
from nltk.corpus import stopwords
import pandas as pd
import gensim
from gensim.utils import simple_preprocess

# Offenseval Dataset

In [19]:
offenseval_train = pd.read_csv("./../../data/offenseval_train.csv")
offenseval_test = pd.read_csv("./../../data/offenseval_test.csv")
offenseval_val = pd.read_csv("./../../data/offenseval_val.csv")
# combine all three datasets
df = pd.concat([offenseval_train, offenseval_test, offenseval_val], ignore_index=True)

In [20]:
# load dataset and tokenize
df['tokenized'] = df['text'].apply(lambda x: simple_preprocess(x))
df.head()

Unnamed: 0,text,label,tokenized
0,@USER She is not leaving BB,0,"[user, she, is, not, leaving, bb]"
1,@USER @USER Ford and the conservatives hates t...,0,"[user, user, ford, and, the, conservatives, ha..."
2,@USER God is good to us :folded_hands: :thumbs...,0,"[user, god, is, good, to, us, folded_hands, th..."
3,@USER woman accusing Supreme Court nominee Kav...,0,"[user, woman, accusing, supreme, court, nomine..."
4,@USER CORRECTION: The Liberals won a false-maj...,0,"[user, correction, the, liberals, won, false, ..."


In [21]:
# combine all the tokenized texts into one list
all_tokenized_text = []
for i in range(len(df)):
    all_tokenized_text.extend(df.loc[i, 'tokenized'])


In [22]:
# build bigram and trigram models
bigram = gensim.models.Phrases(all_tokenized_text, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[all_tokenized_text], threshold=100)
bigram_maker = gensim.models.phrases.Phraser(bigram)
trigram_maker = gensim.models.phrases.Phraser(trigram)

In [23]:
# remove stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'user', 'url'])
all_tokenized_text = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in all_tokenized_text]

In [24]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [25]:
# form bigrams
all_tokenized_text_bi = [bigram_maker[doc] for doc in all_tokenized_text]
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
all_tokenized_text_bi = lemmatization(all_tokenized_text_bi)
# drop all the empty lists
all_tokenized_text_bi = [doc for doc in all_tokenized_text_bi if len(doc) > 0]

In [26]:
dictionary = gensim.corpora.Dictionary(all_tokenized_text_bi)
corpus = [dictionary.doc2bow(doc) for doc in all_tokenized_text_bi]

In [27]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, random_state=100, update_every=1, chunksize=100, passes=10, per_word_topics=True)

In [28]:
from pprint import pprint
# Print the Keyword in the 10 topics
pprint(lda.print_topics(num_topics=10, num_words=10))

[(0,
  '0.490*"liberal" + 0.122*"come" + 0.000*"lie" + 0.000*"even" + 0.000*"year" '
  '+ 0.000*"shit" + 0.000*"thing" + 0.000*"talk" + 0.000*"really" + '
  '0.000*"love"'),
 (1,
  '0.258*"get" + 0.184*"people" + 0.133*"make" + 0.101*"need" + 0.095*"well" + '
  '0.000*"lie" + 0.000*"even" + 0.000*"year" + 0.000*"shit" + 0.000*"thing"'),
 (2,
  '0.383*"maga" + 0.110*"hate" + 0.024*"democratic" + 0.011*"mouth" + '
  '0.000*"cut" + 0.000*"lie" + 0.000*"even" + 0.000*"shit" + 0.000*"year" + '
  '0.000*"thing"'),
 (3,
  '0.219*"want" + 0.215*"right" + 0.103*"leave" + 0.045*"attack" + 0.000*"lie" '
  '+ 0.000*"even" + 0.000*"shit" + 0.000*"year" + 0.000*"thing" + '
  '0.000*"love"'),
 (4,
  '0.390*"say" + 0.132*"also" + 0.022*"assault" + 0.000*"lie" + 0.000*"even" + '
  '0.000*"shit" + 0.000*"year" + 0.000*"thing" + 0.000*"love" + 0.000*"time"'),
 (5,
  '0.124*"use" + 0.095*"white" + 0.094*"fuck" + 0.086*"tweet" + 0.053*"seem" + '
  '0.045*"court" + 0.028*"freedom" + 0.024*"close" + 0.017*"t

In [29]:
from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda, texts=all_tokenized_text_bi, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score:', coherence_lda)

Coherence Score: 0.8521643028672626


UMass Measure: https://mimno.infosci.cornell.edu/papers/mimno-semantic-emnlp.pdf

In [30]:
import pickle 
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = gensimvis.prepare(lda, corpus, dictionary)
LDAvis_prepared


  default_term_info = default_term_info.sort_values(
