In [2]:
# Import and restructure data

import pandas as pd
data = pd.read_csv('2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1.csv', error_bad_lines=False)
text = data[['text']]
text['index'] = text.index
documents = text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [3]:
print(len(documents))
print(documents[:5])

40623
                                                text  index
0  It should be legal for a husband to rape his wife      0
1                            dalits are lowlife scum      1
2                                dalits are lowlives      2
3  It was a better world when women dare not ques...      3
4  I don't work this hard so that those immigrant...      4


In [4]:
# Preprocess Data

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
stemmer = PorterStemmer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [5]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [6]:
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['My', 'black', 'watch', 'reminds', 'me', 'of', 'all', 'black', 'men']


 tokenized and lemmatized document: 
['black', 'watch', 'remind', 'black']


In [7]:
processed_docs = documents['text'].map(preprocess)
processed_docs[:10]

0              [legal, husband, rape, wife]
1                     [dalit, lowlif, scum]
2                           [dalit, lowliv]
3    [better, world, women, dare, question]
4             [work, hard, immigr, benefit]
5            [work, hard, migrant, benefit]
6            [work, hard, foreign, benefit]
7             [work, hard, hispan, benefit]
8               [work, hard, chav, benefit]
9               [work, hard, paki, benefit]
Name: text, dtype: object

In [8]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 husband
1 legal
2 rape
3 wife
4 dalit
5 lowlif
6 scum
7 lowliv
8 better
9 dare
10 question


In [9]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [10]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(29, 2), (318, 1), (1089, 1)]

In [11]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 29 ("black") appears 2 time.
Word 318 ("watch") appears 1 time.
Word 1089 ("remind") appears 1 time.


In [12]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.4816817283932777),
 (1, 0.5260359103822929),
 (2, 0.524052918770054),
 (3, 0.46544330680430235)]


In [13]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [14]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.022*"countri" + 0.019*"year" + 0.019*"peopl" + 0.015*"allow" + 0.014*"leav" + 0.013*"issu" + 0.012*"chang" + 0.012*"life" + 0.011*"shit" + 0.011*"like"
Topic: 1 
Words: 0.058*"need" + 0.038*"stop" + 0.035*"peopl" + 0.032*"like" + 0.022*"word" + 0.021*"think" + 0.021*"fuck" + 0.017*"feel" + 0.016*"have" + 0.015*"call"
Topic: 2 
Words: 0.025*"good" + 0.021*"time" + 0.019*"thing" + 0.017*"go" + 0.016*"peopl" + 0.014*"work" + 0.012*"children" + 0.011*"talk" + 0.010*"school" + 0.010*"societi"
Topic: 3 
Words: 0.046*"peopl" + 0.034*"think" + 0.025*"wrong" + 0.024*"jew" + 0.023*"make" + 0.019*"kill" + 0.013*"disabl" + 0.012*"kid" + 0.010*"turn" + 0.010*"understand"
Topic: 4 
Words: 0.039*"fuck" + 0.037*"opinion" + 0.031*"want" + 0.024*"come" + 0.022*"think" + 0.015*"express" + 0.015*"idiot" + 0.014*"beauti" + 0.014*"allow" + 0.013*"inform"
Topic: 5 
Words: 0.064*"like" + 0.032*"peopl" + 0.028*"black" + 0.022*"person" + 0.020*"hate" + 0.018*"actual" + 0.013*"world" + 0.013*"

In [15]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.016*"concept" + 0.015*"talk" + 0.012*"love" + 0.011*"like" + 0.010*"women" + 0.010*"peopl" + 0.010*"idiot" + 0.010*"opposit" + 0.009*"chines" + 0.009*"marriag"
Topic: 1 Word: 0.015*"tran" + 0.015*"want" + 0.013*"peopl" + 0.011*"like" + 0.010*"fuck" + 0.010*"think" + 0.009*"inform" + 0.008*"evil" + 0.008*"know" + 0.008*"common"
Topic: 2 Word: 0.016*"peopl" + 0.012*"black" + 0.011*"fuck" + 0.011*"say" + 0.010*"lie" + 0.009*"vile" + 0.008*"like" + 0.008*"disabl" + 0.008*"trust" + 0.008*"racist"
Topic: 3 Word: 0.026*"fuck" + 0.016*"shit" + 0.014*"white" + 0.009*"black" + 0.008*"peopl" + 0.008*"stand" + 0.007*"like" + 0.007*"women" + 0.007*"wanna" + 0.007*"annoy"
Topic: 4 Word: 0.022*"wrong" + 0.016*"peopl" + 0.012*"need" + 0.012*"think" + 0.011*"opinion" + 0.011*"immigr" + 0.010*"stop" + 0.010*"black" + 0.009*"asian" + 0.008*"like"
Topic: 5 Word: 0.015*"black" + 0.015*"marri" + 0.014*"peopl" + 0.013*"women" + 0.012*"love" + 0.010*"race" + 0.010*"know" + 0.010*"ignor" + 0.0

In [16]:
processed_docs[4310]

['black', 'watch', 'remind', 'black']

In [17]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8199864625930786	 
Topic: 0.104*"black" + 0.059*"white" + 0.046*"peopl" + 0.020*"tell" + 0.017*"mean" + 0.015*"start" + 0.014*"think" + 0.013*"matter" + 0.011*"attack" + 0.010*"woman"

Score: 0.02001127414405346	 
Topic: 0.108*"women" + 0.035*"fuck" + 0.016*"shit" + 0.016*"like" + 0.016*"know" + 0.015*"say" + 0.010*"point" + 0.009*"better" + 0.009*"look" + 0.008*"woman"

Score: 0.020001549273729324	 
Topic: 0.064*"like" + 0.032*"peopl" + 0.028*"black" + 0.022*"person" + 0.020*"hate" + 0.018*"actual" + 0.013*"world" + 0.013*"girl" + 0.013*"fuck" + 0.011*"talk"

Score: 0.020000282675027847	 
Topic: 0.048*"love" + 0.045*"peopl" + 0.036*"muslim" + 0.029*"fuck" + 0.029*"know" + 0.023*"hate" + 0.018*"tran" + 0.018*"marri" + 0.016*"get" + 0.014*"want"

Score: 0.020000215619802475	 
Topic: 0.025*"peopl" + 0.019*"best" + 0.018*"friend" + 0.018*"famili" + 0.014*"look" + 0.013*"love" + 0.013*"dont" + 0.012*"explain" + 0.012*"fuck" + 0.012*"want"

Score: 0.020000111311674118	 
Topic: 0.0

In [18]:
unseen_document = 'All Jews deserve to die'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.47969916462898254	 Topic: 0.048*"love" + 0.045*"peopl" + 0.036*"muslim" + 0.029*"fuck" + 0.029*"know"
Score: 0.2536140978336334	 Topic: 0.046*"peopl" + 0.034*"think" + 0.025*"wrong" + 0.024*"jew" + 0.023*"make"
Score: 0.03334129974246025	 Topic: 0.039*"fuck" + 0.037*"opinion" + 0.031*"want" + 0.024*"come" + 0.022*"think"
Score: 0.03333670273423195	 Topic: 0.064*"like" + 0.032*"peopl" + 0.028*"black" + 0.022*"person" + 0.020*"hate"
Score: 0.03333630785346031	 Topic: 0.022*"countri" + 0.019*"year" + 0.019*"peopl" + 0.015*"allow" + 0.014*"leav"
Score: 0.03333505615592003	 Topic: 0.025*"good" + 0.021*"time" + 0.019*"thing" + 0.017*"go" + 0.016*"peopl"
Score: 0.03333483636379242	 Topic: 0.058*"need" + 0.038*"stop" + 0.035*"peopl" + 0.032*"like" + 0.022*"word"
Score: 0.03333430737257004	 Topic: 0.108*"women" + 0.035*"fuck" + 0.016*"shit" + 0.016*"like" + 0.016*"know"
Score: 0.033334117382764816	 Topic: 0.104*"black" + 0.059*"white" + 0.046*"peopl" + 0.020*"tell" + 0.017*"mean"
Score

In [19]:
unseen_document = 'So interesting to see progressive Democrat congresswomen, who originally came from countries whose governments are a complete and total catastrophe, the worst, most corrupt and inept anywhere in the world'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6560420989990234	 Topic: 0.022*"countri" + 0.019*"year" + 0.019*"peopl" + 0.015*"allow" + 0.014*"leav"
Score: 0.17694002389907837	 Topic: 0.048*"love" + 0.045*"peopl" + 0.036*"muslim" + 0.029*"fuck" + 0.029*"know"
Score: 0.113163061439991	 Topic: 0.025*"good" + 0.021*"time" + 0.019*"thing" + 0.017*"go" + 0.016*"peopl"


In [21]:
from gensim.models import CoherenceModel
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.2846339425358257


In [22]:
coherence_model_lda_tfidf = CoherenceModel(model=lda_model_tfidf, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda_tfidf = coherence_model_lda_tfidf.get_coherence()
print('\nCoherence Score: ', coherence_lda_tfidf)


Coherence Score:  0.26267082083738985
