In [1]:
import pycmf

In [2]:
import pandas as pd
import numpy as np

In [3]:
from pathlib import Path

In [4]:
# Relative directory that the CSV inputs are read from.
DATA: Path = Path("data")

# Read Data

In [5]:
# Load both CSV splits from the data directory into DataFrames.
train, test = (pd.read_csv(DATA / csv_name) for csv_name in ("train.csv", "test.csv"))

In [6]:
# Peek at the first ten rows to check the columns and how labels are encoded.
train.iloc[:10]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


# Preprocess Data

First, vectorize the comment text that the topic models will be fit on.

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [8]:
# Bag-of-words features; the keyword arguments are standard scikit-learn options.
vectorizer = CountVectorizer(
    min_df=3,     # ignore terms that appear in fewer than 3 comments
    max_df=0.9,   # ignore terms that appear in more than 90% of comments
    binary=True,  # record presence/absence instead of raw counts
)

In [9]:
%%time
X_train = vectorizer.fit_transform(train.head(10000).comment_text)

CPU times: user 18.4 s, sys: 252 ms, total: 18.7 s
Wall time: 18.9 s


In [10]:
# Shape of the vectorized text matrix returned by fit_transform.
X_train.shape

(159571, 52750)

Next, extract the label data that will be decomposed jointly with the text.

In [11]:
from scipy.sparse import csc_matrix, csr_matrix

In [12]:
# Label columns are everything after the first two columns of `train`.
label_columns = train.columns[2:].tolist()
label_columns

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [22]:
# Select just the label columns, in the order given by `label_columns`.
Y_train = train.loc[:, label_columns]

In [23]:
# Shape of the label frame selected from `train`.
Y_train.shape

(159571, 6)

# Attempt Decomposition

In [16]:
# Number of components passed to each decomposition model in this notebook.
N_COMPONENTS: int = 20

Analysis functions we'll use later

In [17]:
# Vocabulary as an array so words can be looked up by feature index.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out()` and fall back only on older releases.
idx_to_word = np.asarray(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
def print_topics(H, n_top_words=10, vocab=None):
    """Print the highest-weighted words of every topic, one topic per line.

    Parameters
    ----------
    H : array-like of shape (n_topics, n_terms)
        Topic-term weights; each row is treated as one topic.
    n_top_words : int, default 10
        How many words to show per topic.
    vocab : sequence of str, optional
        Maps a column index of ``H`` to its word. Defaults to the
        notebook-level ``idx_to_word`` array.

    Raises
    ------
    ValueError
        If ``n_top_words`` is smaller than 1.

    Notes
    -----
    Words are listed in ascending weight order, so the strongest word of a
    topic is the last one on its line.
    """
    if n_top_words < 1:
        # A zero/negative count would make the `[-n:]` slice below select the
        # wrong elements instead of failing loudly.
        raise ValueError("n_top_words must be a positive integer")
    # Resolve the vocabulary at call time so the global can be (re)built later.
    words = np.asarray(idx_to_word if vocab is None else vocab)
    for topic_no, weights in enumerate(H, start=1):
        top_idx = np.asarray(weights).argsort()[-n_top_words:]
        print("Topic {}: {}".format(topic_no, ",".join(str(w) for w in words[top_idx])))

### NMF

In [18]:
from sklearn.decomposition import NMF

In [19]:
# Fix the seed so the factorization (and the topics printed from it) is
# reproducible across kernel restarts.
nmf = NMF(n_components=N_COMPONENTS, random_state=0)

In [30]:
%%time
W = nmf.fit_transform(X_train); H = nmf.components_

CPU times: user 1min 33s, sys: 473 ms, total: 1min 33s
Wall time: 1min 34s


In [32]:
# Show the top words for each NMF component.
print_topics(H)

Topic 1: by,what,at,but,about,so,one,was,all,with
Topic 2: re,can,don,do,my,know,what,me,your,you
Topic 3: we,going,need,just,about,like,utc,at,the,to
Topic 4: guidelines,remove,speedy,not,note,has,don,deleted,deletion,do
Topic 5: edit,hello,editing,wikipedia,sign,will,welcome,how,pages,help
Topic 6: do,editing,to,blocked,so,this,please,will,if,from
Topic 7: don,because,think,like,just,so,if,was,but,it
Topic 8: who,why,there,does,do,but,or,they,are,not
Topic 9: comments,your,comment,discussion,the,user,my,talk,page,on
Topic 10: more,see,good,work,reason,time,your,the,thanks,for
Topic 11: if,thank,been,other,please,use,any,or,wikipedia,at
Topic 12: the,here,one,he,an,what,no,there,this,is
Topic 13: only,then,find,think,see,fact,but,so,was,that
Topic 14: or,there,think,will,to,if,can,would,should,be
Topic 15: removed,other,he,his,about,all,more,has,with,and
Topic 16: would,am,an,any,my,to,we,no,been,have
Topic 17: from,part,out,list,some,all,by,one,the,of
Topic 18: which,section,case,fro

### CMF with non-negative constraint on all matrices

In [24]:
# CMF model with all three factor matrices (U, V, Z) constrained to be non-negative.
cmf_nn = pycmf.CMF(
    N_COMPONENTS,
    U_non_negative=True,
    V_non_negative=True,
    Z_non_negative=True,
    solver="newton",
    verbose=True,
    sg_sample_ratio=0.3,
)

In [None]:
%%time
U, V, Z = cmf_nn.fit_transform(csr_matrix(X_train.T), Y_train)

Epoch 10 reached after 216846.972 seconds, error: 80.235932


### CMF with non-negative constraint on only the topic matrices

In [None]:
# CMF model where only U and V are constrained to be non-negative; Z is left
# unconstrained and the label side uses a sigmoid link.
cmf = pycmf.CMF(
    N_COMPONENTS,
    U_non_negative=True,
    V_non_negative=True,
    Z_non_negative=False,
    x_link="linear",
    y_link="sigmoid",
    solver="newton",
    verbose=True,
    sg_sample_ratio=0.3,
)