Running boilerplate LDA models from gensim for topic modeling.
Inspired by: https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

In [6]:
import nltk
import spacy
from nltk.corpus import stopwords
import pandas as pd
import gensim
from gensim.utils import simple_preprocess

# Implicit Hate Dataset

In [7]:
# load dataset and tokenize
df = pd.read_csv('./../../data/original_datasets/implicit_hate_v1_stg1_posts.tsv', sep='\t')
df['tokenized'] = df['post'].apply(lambda x: simple_preprocess(x))
df.head()

Unnamed: 0,post,class,tokenized
0,""" : jewish harvard professor noel ignatiev w...",implicit_hate,"[jewish, harvard, professor, noel, ignatiev, w..."
1,b.higher education is a part of european cult...,not_hate,"[higher, education, is, part, of, european, cu..."
2,"has a problem with "" the whites "" "" and "" "" ...",not_hate,"[has, problem, with, the, whites, and, the, ch..."
3,is yasir qadhi a hate preacher for calling ch...,not_hate,"[is, yasir, qadhi, hate, preacher, for, callin..."
4,"rt "" : how three million germans mass murder...",not_hate,"[rt, how, three, million, germans, mass, murde..."


In [8]:
# combine all the tokenized texts into one list
all_tokenized_text = []
for i in range(len(df)):
    all_tokenized_text.extend(df['tokenized'][i])
print(len(all_tokenized_text))

309030


In [9]:
# build bigram and trigram models
bigram = gensim.models.Phrases(all_tokenized_text, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[all_tokenized_text], threshold=100)
bigram_maker = gensim.models.phrases.Phraser(bigram)
trigram_maker = gensim.models.phrases.Phraser(trigram)

In [10]:
# remove stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
all_tokenized_text = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in all_tokenized_text]

In [11]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [12]:
# form bigrams
all_tokenized_text_bi = [bigram_maker[doc] for doc in all_tokenized_text]
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
all_tokenized_text_bi = lemmatization(all_tokenized_text_bi)
# drop all the empty lists
all_tokenized_text_bi = [doc for doc in all_tokenized_text_bi if len(doc) > 0]
print(all_tokenized_text_bi[:20])

[['jewish'], ['professor'], ['ignatiev'], ['want'], ['abolish'], ['white'], ['race'], ['high'], ['education'], ['part'], ['european'], ['culture'], ['import'], ['continent'], ['asian'], ['culture'], ['pressure'], ['people'], ['aim'], ['good']]


In [13]:
dictionary = gensim.corpora.Dictionary(all_tokenized_text_bi)
corpus = [dictionary.doc2bow(doc) for doc in all_tokenized_text_bi]

In [14]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, random_state=100, update_every=1, chunksize=100, passes=10, per_word_topics=True)

In [15]:
from pprint import pprint
# Print the Keyword in the 10 topics
pprint(lda.print_topics(num_topics=10, num_words=10))

[(0,
  '0.508*"get" + 0.071*"old" + 0.068*"free" + 0.027*"change" + 0.016*"rt" + '
  '0.011*"german" + 0.000*"woman" + 0.000*"know" + 0.000*"guy" + 0.000*"make"'),
 (1,
  '0.451*"people" + 0.244*"call" + 0.026*"hate" + 0.010*"supremacist" + '
  '0.000*"woman" + 0.000*"take" + 0.000*"let" + 0.000*"guy" + 0.000*"know" + '
  '0.000*"difference"'),
 (2,
  '0.351*"go" + 0.202*"well" + 0.028*"murder" + 0.018*"anti" + 0.000*"spencer" '
  '+ 0.000*"woman" + 0.000*"know" + 0.000*"guy" + 0.000*"difference" + '
  '0.000*"take"'),
 (3,
  '0.205*"kill" + 0.179*"country" + 0.079*"today" + 0.038*"watch" + '
  '0.000*"woman" + 0.000*"know" + 0.000*"let" + 0.000*"take" + 0.000*"make" + '
  '0.000*"guy"'),
 (4,
  '0.465*"white" + 0.021*"european" + 0.000*"woman" + 0.000*"guy" + '
  '0.000*"know" + 0.000*"take" + 0.000*"difference" + 0.000*"let" + '
  '0.000*"girl" + 0.000*"look"'),
 (5,
  '0.228*"right" + 0.172*"jewish" + 0.070*"arrest" + 0.062*"race" + '
  '0.000*"woman" + 0.000*"let" + 0.000*"take" + 

In [16]:
from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda, texts=all_tokenized_text_bi, dictionary=dictionary, coherence='u_mass')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score:', coherence_lda)

Coherence Score: -21.63953594966157


UMass Measure: https://mimno.infosci.cornell.edu/papers/mimno-semantic-emnlp.pdf

In [17]:
import pickle 
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = gensimvis.prepare(lda, corpus, dictionary)
LDAvis_prepared


  default_term_info = default_term_info.sort_values(
