In [1]:
import pycmf

In [2]:
import pandas as pd
import numpy as np

In [3]:
from pathlib import Path

# Read Data

The [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) was a Kaggle competition to classify comments into various categories of toxicity.


In this tutorial, we will see how CMF can use the toxicity labels as supervision to extract topics that are relevant to classification, and how other topic modeling techniques cannot accomplish this.

Before running this code, you'll need to download the data from [here](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data) and put it in a directory of your choice (make sure the `DATA` variable below points to the directory where the data resides).

In [4]:
# Directory holding the downloaded Kaggle files; train.csv is read from here.
DATA = Path("data")

In [5]:
# Load the labelled training comments from train.csv into a DataFrame.
train = pd.read_csv(DATA / "train.csv")

Let's take a look at the data

In [6]:
# Preview the first 10 rows: id, comment text and the six label columns.
train.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [7]:
# Summary statistics of the label columns; for 0/1 labels the mean is the
# fraction of comments carrying that label.
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


# Preprocess Data

We will now construct the text data to do topic modeling on

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [9]:
# Bag-of-words encoder for the comment text.
vectorizer = CountVectorizer(
    min_df=3,              # ignore terms found in fewer than 3 documents
    max_df=0.9,            # ignore terms found in more than 90% of documents
    stop_words="english",  # drop scikit-learn's built-in English stop words
    binary=True,           # store presence/absence (0/1) instead of counts
)

In [10]:
%%time
# Vectorize only the first 10,000 comments; the vocabulary is learned from
# this same subset.
X_train = vectorizer.fit_transform(
    train["comment_text"].head(10000)
)

CPU times: user 577 ms, sys: 3.56 ms, total: 580 ms
Wall time: 580 ms


In [11]:
# (number of comments, vocabulary size)
X_train.shape

(10000, 9927)

The label data to decompose

In [12]:
from scipy.sparse import csc_matrix, csr_matrix

In [13]:
# Every column after the first two (id, comment_text) is a toxicity label.
label_columns = train.columns[2:].tolist()
label_columns

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [14]:
# Take the labels for exactly the rows that were vectorized into X_train.
# Deriving the row count from X_train (instead of repeating a literal) keeps
# the two matrices row-aligned if the subsample size is ever changed.
Y_train = train.head(X_train.shape[0])[label_columns].values

In [15]:
# (number of comments, number of labels)
Y_train.shape

(10000, 6)

# Attempt Decomposition

In [16]:
# Number of latent components (topics) requested from every factorization below.
N_COMPONENTS = 20

Analysis functions we'll use later

In [17]:
# Lookup array mapping a vocabulary column index to its term.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); support both so the notebook runs on either.
if hasattr(vectorizer, "get_feature_names_out"):
    idx_to_word = np.asarray(vectorizer.get_feature_names_out())
else:
    idx_to_word = np.asarray(vectorizer.get_feature_names())
def print_topics(H, topn=10, vocab=None):
    """Print the highest-weighted terms of every topic.

    Parameters
    ----------
    H : 2-D array-like
        One row of term weights per topic.
    topn : int, default 10
        Number of terms to show per topic. Values <= 0 show no terms.
    vocab : array-like of str, optional
        Term for each column of ``H``. Defaults to the notebook-level
        ``idx_to_word`` array.

    Notes
    -----
    Terms are listed in ascending order of weight, so the strongest term
    of each topic is printed last.
    """
    if vocab is None:
        vocab = idx_to_word
    vocab = np.asarray(vocab)
    for topic_no, weights in enumerate(H, start=1):
        order = np.asarray(weights).argsort()
        # `order[-0:]` would select *every* term, so guard non-positive topn.
        top_idx = order[-topn:] if topn > 0 else order[:0]
        terms = ",".join(str(term) for term in vocab[top_idx])
        print(f"Topic {topic_no}: {terms}")

### NMF

Let's see what topics we find using NMF

In [18]:
from sklearn.decomposition import NMF

In [19]:
# Plain NMF baseline using the same number of components as the CMF models.
nmf = NMF(n_components=N_COMPONENTS)

In [20]:
%%time
# W holds the per-comment topic weights returned by the fit;
# H holds the learned per-topic term weights (one row per topic).
W = nmf.fit_transform(X_train)
H = nmf.components_

CPU times: user 3.68 s, sys: 10 ms, total: 3.69 s
Wall time: 3.69 s


In [21]:
# Show the top terms of each NMF topic (one row of H per topic).
print_topics(H)

Topic 1: used,right,case,said,point,make,fact,way,say,does
Topic 2: tildes,hello,questions,date,welcome,sign,using,editing,pages,help
Topic 3: placed,speedily,guidelines,tag,add,remove,speedy,note,deleted,deletion
Topic 4: message,added,comments,user,wp,comment,discussion,redirect,contribs,talk
Topic 5: pov,history,subject,discussion,good,main,created,needs,section,article
Topic 6: user,said,want,doesn,say,care,understand,need,really,don
Topic 7: considered,editors,users,http,stop,policy,en,wiki,org,wikipedia
Topic 8: bit,new,sounds,hey,feel,thing,really,things,looks,like
Topic 9: let,editor,right,mean,ll,really,going,wanted,say,just
Topic 10: say,wiki,dont,got,thing,didn,want,did,let,know
Topic 11: vandalize,make,stop,pages,continue,edits,editing,blocked,did,edit
Topic 12: post,read,message,delete,links,new,link,discussion,user,page
Topic 13: life,way,really,wiki,person,going,group,world,want,people
Topic 14: way,added,new,lot,ll,work,help,good,hi,thanks
Topic 15: right,way,section,su

None of these topics seems related to toxicity...

The hyperparameter alpha determines the trade-off between how much to value the decomposition of one matrix over the other when computing the loss. It only affects the results when using the newton solver.

We'll use a simple heuristic to compute alpha here, based on the relative Frobenius norms of the two matrices.

In [22]:
# Frobenius norms of the term matrix (sparse) and the label matrix (dense).
xnorm = np.sqrt(X_train.power(2).sum())
ynorm = np.sqrt(np.square(Y_train).sum())
# Heuristic weight: the label matrix's share of the combined norm, raised to 0.75.
alpha = (ynorm / (xnorm + ynorm)) ** 0.75
alpha

0.1643312620900889

### CMF with non-negative constraint on all matrices

In [23]:
# Collective factorization with all three factor matrices (U, V, Z)
# constrained to be non-negative.
cmf_nn = pycmf.CMF(
    N_COMPONENTS,
    U_non_negative=True,
    V_non_negative=True,
    Z_non_negative=True,
    alpha=alpha,
    solver="mu",  # presumably multiplicative updates — confirm in pycmf docs
    verbose=True,
)

In [24]:
%%time
# X is passed transposed (terms x comments, in CSR format) together with
# Y (comments x labels).
U, V, Z = cmf_nn.fit_transform(
    X_train.T.tocsr(),
    Y_train,
)

Epoch 10 reached after 6.567 seconds, error: 234.839634
Epoch 20 reached after 13.161 seconds, error: 234.113588
Epoch 30 reached after 19.860 seconds, error: 234.033264
Epoch 40 reached after 27.461 seconds, error: 233.994436
Epoch 50 reached after 36.267 seconds, error: 233.970200
CPU times: user 25.4 s, sys: 11.1 s, total: 36.5 s
Wall time: 36.5 s


In [25]:
# Print each topic's top terms. NOTE(review): the six bracketed numbers look
# like the topic's weights for the six labels — confirm against pycmf.
cmf_nn.print_topic_terms(vectorizer)

Topic 1 [0.000,0.000,0.000,0.000,0.000,0.000]: right,said,case,used,point,fact,way,make,say,does
Topic 2 [0.000,0.000,0.000,0.002,0.000,0.000]: date,hello,ask,welcome,sign,questions,using,editing,pages,help
Topic 3 [0.000,0.000,0.000,0.000,0.000,0.000]: information,free,articles,tag,add,criteria,note,speedy,deletion,deleted
Topic 4 [0.000,0.000,0.000,0.000,0.000,0.000]: remove,message,comments,comment,wp,user,redirect,discussion,contribs,talk
Topic 5 [0.000,0.000,0.000,0.000,0.000,0.000]: version,history,wp,created,pov,main,needs,discussion,section,article
Topic 6 [6.421,0.970,4.473,0.233,4.192,0.721]: dick,life,suck,bitch,stop,ass,stupid,shit,fucking,fuck
Topic 7 [0.090,0.000,0.000,0.000,0.000,0.000]: vandalism,users,editors,stop,wiki,en,policy,org,articles,wikipedia
Topic 8 [0.017,0.000,0.000,0.000,0.000,0.000]: hey,bit,really,thing,new,sounds,look,things,looks,like
Topic 9 [0.004,0.000,0.000,0.000,0.000,0.000]: hi,got,didn,mean,ll,going,let,say,wanted,just
Topic 10 [0.001,0.000,0.00

The results arguably look better: we now see some topics that are actually related to toxicity. Let's see what happens when we drop the non-negativity constraint on `Z` (`Z_non_negative=False`).

### CMF with non-negative constraint on only the topic matrices

In [26]:
# Same model family, but Z is no longer constrained to be non-negative.
# Uses the newton solver with a linear link for x and a logit link for y.
cmf = pycmf.CMF(
    N_COMPONENTS,
    U_non_negative=True,
    V_non_negative=True,
    Z_non_negative=False,
    x_link="linear",
    y_link="logit",
    alpha=alpha,
    l1_reg=2.,
    l2_reg=5.,
    max_iter=10,
    solver="newton",
    verbose=True,
)

In [27]:
%%time
# X is passed transposed (terms x comments, in CSR format) together with
# Y (comments x labels). This rebinds U, V and Z.
U, V, Z = cmf.fit_transform(
    X_train.T.tocsr(),
    Y_train,
)

Epoch 10 reached after 233.713 seconds, error: 135.584596
CPU times: user 3min 34s, sys: 19.9 s, total: 3min 53s
Wall time: 3min 53s


In [28]:
# Print each topic's top terms. NOTE(review): the six bracketed numbers look
# like the topic's weights for the six labels (now allowed to be negative) —
# confirm against pycmf.
cmf.print_topic_terms(vectorizer)

Topic 1 [-0.047,-0.323,-0.057,-0.469,-0.015,-0.257]: hate,little,hell,life,hey,know,shit,ass,fucking,fuck
Topic 2 [0.139,-3.046,-2.502,-3.067,-2.442,-3.153]: say,wikipedia,like,articles,know,just,time,think,people,don
Topic 3 [-1.022,-0.368,0.075,-0.461,-0.314,-0.382]: non,ask,material,read,reason,article,page,want,wikipedia,deleted
Topic 4 [-5.596,-5.963,-4.806,-6.224,-5.267,-6.076]: don,confirm,accepted,companies,indicate,year,bands,company,tagging,club
Topic 5 [-6.810,-4.408,-6.029,-3.981,-5.526,-4.095]: way,page,wikipedia,right,talk,user,use,section,know,article
Topic 6 [-0.698,-0.271,-0.237,-0.501,-0.341,-0.307]: editors,way,talk,does,article,make,wikipedia,page,questions,like
Topic 7 [-4.465,-3.540,-4.501,-3.125,-4.039,-3.068]: edit,really,page,article,just,think,like,people,know,don
Topic 8 [0.516,0.147,0.988,-0.568,0.920,0.194]: gets,avoid,stupid,right,man,suck,nazi,bitch,faggot,fuck
Topic 9 [2.863,-1.721,1.025,-1.698,0.904,-1.538]: contributing,create,takes,learn,tests,policy,

There now seem to be many more provocative and obscene topics. It's interesting to see how universal words like f\*\*k are in toxic comments.