# Topic Modeling on Harmful Comments with LDA and NMF

In [1]:
# Standard libraries
import pandas as pd
import numpy as np

# Scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation

### Load in data

In [2]:
# Load the pickled dataset (used below as a DataFrame with `harmful` and
# `model_text` columns).
# NOTE(review): unpickling can execute arbitrary code - only load
# data_cleaned.pkl when it comes from a trusted source.
data = pd.read_pickle('data_cleaned.pkl')

In [3]:
# Keep only the rows whose `harmful` flag equals 1
data = data.loc[data['harmful'] == 1]

In [4]:
# Pull the `model_text` column out as a plain Python list; this list is what
# the length check and both vectorizers below consume.
comments = data['model_text'].tolist()

### How long are the comments?

In [5]:
# Find the average length of a comment, measured in words.
# str.split() with no argument splits on any run of whitespace, so repeated
# spaces, tabs and newlines do not produce empty "words" (split(' ') would
# count them) and an empty comment counts as 0 words instead of 1.
word_length = [len(comment.split()) for comment in comments]

# Calculate the mean, rounded to the nearest whole word
word_mean = int(round(np.mean(word_length)))

# View average
print('The average number of words in a toxic comment is:', word_mean)

The average number of words in a toxic comment is: 27


### Setup for Topic Display

In [6]:
def display_topics(model, feature_names, no_top_words):
    """
    Print the highest-weighted words of every topic in a fitted model.

    *Source: adapted from Aneesha Bakharia,
    https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

    For each row of ``model.components_`` a "Topic <index>:" header is
    printed, followed by the ``no_top_words`` feature names with the
    largest weights (highest first, space-separated) and a blank line.

    :param: model: fitted object exposing ``components_``
        (e.g. an sklearn.decomposition model)
    :param: feature_names: sequence of names, indexed by column position
    :param: no_top_words: int, number of words to show per topic

    :returns: None; the topics and their top words are printed
    """
    for topic_number, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        ranked_columns = weights.argsort()[::-1]
        top_words = [feature_names[col] for col in ranked_columns[:no_top_words]]

        print("Topic {:d}:".format(topic_number))
        print(" ".join(top_words))
        print()

In [7]:
# Set number of features: the vocabulary size cap passed as max_features to
# both the tf-idf (NMF) and the count (LDA) vectorizers below
no_features = 1000

### Non-negative Matrix Factorization (NMF)

In [8]:
# NMF using tf-idf
# max_df=0.95 drops terms that appear in more than 95% of the comments,
# min_df=2 drops terms that appear in fewer than 2 comments, English stop
# words are removed and the vocabulary keeps at most no_features terms.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(comments)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (added in 1.0); use whichever this version provides.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Set number of topics
no_topics = 6

# Run NMF
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 and removed in 1.2
# (replaced by alpha_W / alpha_H, whose regularization is scaled differently).
# It is kept here so the fitted topics stay comparable with the recorded
# output; on scikit-learn >= 1.2 this argument has to be migrated and re-tuned.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

In [9]:
# Show the highest-weighted words of each NMF topic
no_top_words = 10
display_topics(model=nmf, feature_names=tfidf_feature_names, no_top_words=no_top_words)

Topic 0:
fuck asshole shut cunt faggot hey want mother bitch fuckin

Topic 1:
like wikipedia page people know stop talk article stupid think

Topic 2:
fucking cunt faggot asshole mother moron retard life idiot hope

Topic 3:
suck dick cock balls hey asshole big bitch cunt ass

Topic 4:
shit bitch ass piece little eat son faggot fuckin hell

Topic 5:
gay fag faggot sex ass ur im homosexual like likes



### Latent Dirichlet Allocation (LDA)

In [10]:
# LDA is fit on raw term counts (CountVectorizer) rather than tf-idf weights,
# because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(comments)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (added in 1.0); use whichever this version provides.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

# Set number of topics
no_topics = 10

# Run LDA
# `n_topics` was renamed to `n_components` in scikit-learn 0.19 and the old
# name was removed in 0.21, so the current keyword is used here.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)



In [11]:
# Show the highest-weighted words of each LDA topic
no_top_words = 10
display_topics(model=lda, feature_names=tf_feature_names, no_top_words=no_top_words)

Topic 0:
hate ass cock fag sex yourselfgo lick nice lmao nl33ers

Topic 1:
sucks nipple stupid penis huge faggots love dickhead vandalism wiki

Topic 2:
fuck shit bitch shut piece fucking fuckin bitches cocksucking admins

Topic 3:
nigger faggot aids cunt eat wiki equalsequalsequalsequals loser noobs cuntbag

Topic 4:
like people know article wikipedia think time life admin fucking

Topic 5:
wikipedia page die bullshit dont asshole talk block pig care

Topic 6:
suck hi fucking dick dog bastard know mother balls pussy

Topic 7:
gay moron equalsequals fucker bark cocksucker super mothjer like homo

Topic 8:
fat jew buttsecks bush want poop retarded god fack chicken

Topic 9:
idiot freedom old hey damn im rape like hitler hope

