In [2]:
import os
import re
from collections import Counter

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import spacy
from nltk.corpus import stopwords

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Enable inline plotly rendering for the iplot() calls below.
# NOTE(review): connected=True presumably loads plotly.js from the CDN rather than
# embedding it in the notebook - confirm against the installed plotly version.
init_notebook_mode(connected=True)

In [5]:
# Location of the Jigsaw train.csv. Set the JIGSAW_TRAIN_CSV environment variable to
# point at your own copy of the file; the fallback is the path this notebook was
# originally run with, so existing behaviour is unchanged when the variable is unset.
TRAIN_CSV = os.environ.get(
    'JIGSAW_TRAIN_CSV',
    '/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/train.csv',
)
train = pd.read_csv(TRAIN_CSV)

In [6]:
# Preview the first five rows to check the columns and the label encoding.
train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
# Report how many comments the training set holds.
n_items = len(train)
print("The dataset contains", n_items, "items.")

The dataset contains 159571 items.


Let's separate the data itself and the target class labels into separate variables.

In [8]:
# Index every row by its comment id so that texts and labels share the same index.
train.index = train['id']
# Raw comment texts (the model input).
x_train = train['comment_text']
# Everything from the third column onwards is a label column (the first two are
# 'id' and 'comment_text'). Take an explicit copy: a later cell adds a column to
# y_train, and doing that on a slice of `train` raises SettingWithCopyWarning.
y_train = train.iloc[:, 2:].copy()

Now let's also add a "clean" column to the target variables.

In [9]:
# A comment is "clean" when none of the toxicity labels is set, i.e. the row sum over
# the label columns is zero. The 'clean' column itself is excluded from that sum so
# the cell can be re-run safely: summing it as well would flag every row as not clean
# on the second run.
label_columns = [column for column in y_train.columns if column != 'clean']
# Stored as 0/1 integers, like the other label columns.
y_train['clean'] = (y_train[label_columns].sum(axis=1) == 0).astype(int)

In [10]:
# Column-wise sum = number of comments carrying each label (a Series indexed by label name).
class_totals = y_train.sum(axis=0)
# Series.items() yields (label, total) pairs, much like dict.items();
# zip(*...) transposes those pairs into one tuple of labels and one tuple of totals.
kinds, counts = zip(*class_totals.items())

In [11]:
# Bar chart of how many comments fall into each class.
layout = go.Layout(title="Class distribution in train set")
bars = go.Bar(x=kinds, y=counts)

fig = go.Figure(data=[bars], layout=layout)
iplot(fig, filename='bar')

So what we see is a very imbalanced dataset with most of the examples being clean. Let's print some comments from each category:

In [12]:
# Print up to three example comments for every class.
for kind in y_train.columns:
    print('Sample from "{}"'.format(kind))
    x_kind = x_train[y_train[kind] == 1]
    # Fixed random_state keeps the printed examples identical across re-runs;
    # min() avoids a ValueError when a class has fewer than three comments.
    x_kind = x_kind.sample(min(3, len(x_kind)), random_state=42)
    print("\n".join(x_kind))
    print("\n")

Sample from "toxic"
You fucking guinea wop, greaseball, guido, new york piece of shit.  Go back to your bumfilled, AIDS infested, cesspool of a state.
Get a psychatrist, moron. 

You 9/11 truthers make me sick. paranoid schizophrenics like yourself shouldn't be allowed to use a computer. I'd suggest that you stop lying on your userpage that your are in MENSA - clearly your not. Only retards think 9/11 was an inside job. Oh and take a bath. It's been months since you last took one.
What... 

What are you doing? Don't fuck me. I fuck to Ryulong, not you.


Sample from "severe_toxic"
Why, hello, Mr.Fucking jackass cunt, remember me? I wasnt gone.Suck my fucking cock.
"I would like to extend a fat, robust FUCK YOU to the asshole who deleted my criticism of Schengen.  And you wonder why I continue to vandalize Wikipedia? Its because some fucking asshole from the Wiki community continually deletes whatever I post.  Yes I have used profanity, and yes I have often not followed protocols.  Yet 

These comments are abhorrent, which underlines the importance of the task. We can also see that they really need some normalization:
1. We may consider lowercasing - this is a common operation, but we can see that there are a lot of CAPS in toxic comments, so this might turn out to be a useful feature.
2. Also, there are excessive punctuation and whitespace characters. Whitespaces sure need trimming, while punctuation is very demonstrative of emotions, which are often present in the comments.
3. Another thing is to use a lemmatizer to lower the dimensionality of our vector space (if we use bag of words representation). This will be the next thing to explore.

# Text normalization
We will use spaCy to lemmatize the text (i.e. convert every word into its dictionary form), and we will also load the list of English stopwords (words that appear commonly but do not really convey a lot of meaning, like "the" or "at") from NLTK.

In [15]:
# Load the English spaCy pipeline with the parser, tagger and NER components switched
# off; only tokenisation and lemma_ are used below.
# NOTE(review): the "en" shortcut name is a spaCy 2.x convention - confirm it resolves
# with the installed spaCy version.
nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])
# Keep the stopwords in a set: normalize() tests membership once per token, and a set
# lookup is constant-time whereas scanning the NLTK list is linear.
stops = set(stopwords.words("english"))

In [16]:
def normalize(comment, lowercase, remove_stopwords):
    """Lemmatize a comment and return it as a single space-separated string.

    Args:
        comment: Raw comment text (str).
        lowercase: If true, lowercase the text before it is passed to spaCy.
        remove_stopwords: If true, drop every lemma found in the module-level
            ``stops`` collection.

    Returns:
        The kept lemmas joined by single spaces. Lemmas that are empty after
        stripping whitespace are always dropped, so the result may be "".
    """
    if lowercase:
        comment = comment.lower()
    # Strip each lemma so whitespace-only tokens collapse to "" and get filtered out.
    lemmas = (token.lemma_.strip() for token in nlp(comment))
    # The stopword list is lowercase, so compare case-insensitively; otherwise
    # capitalised stopwords ("The", "And") would survive when lowercase=False.
    kept = [
        lemma
        for lemma in lemmas
        if lemma and not (remove_stopwords and lemma.lower() in stops)
    ]
    return " ".join(kept)

Now we apply our function to the texts of the comments to normalize them. Note that we can change flags like lowercase or remove_stopwords when we want to try different strategies later.

In [17]:
# Normalize every comment: lowercase, lemmatize and drop stopwords.
x_train_lemmatized = x_train.map(
    lambda comment: normalize(comment, lowercase=True, remove_stopwords=True)
)

In [18]:
# Show one normalized comment as a sanity check; the fixed random_state makes the
# displayed example the same on every run.
x_train_lemmatized.sample(1, random_state=42).iloc[0]

"think keep get mix someone else . number good beyond western estimate , however . ' ' ' ' ' '"

# Word frequency visualizations

In [19]:
# Per-class token frequencies: maps each label name to a Counter over the tokens of
# all normalized comments that carry that label.
word_counts = {}

for kind in y_train.columns:
    comments = x_train_lemmatized[y_train[kind] == 1]
    # Iterate the Series values directly (Series.iteritems() was removed in pandas 2.0).
    # str.split() without an argument yields no tokens for an empty comment, whereas
    # split(" ") would count a spurious "" token.
    word_counts[kind] = Counter(
        word for comment in comments for word in comment.split()
    )

In [20]:
def most_common_words(kind, num_words=15):
    """Draw a horizontal bar chart of the most frequent tokens of one class.

    Args:
        kind: Key into the module-level ``word_counts`` dict (a class label).
        num_words: How many of the most frequent tokens to plot.
    """
    top_pairs = word_counts[kind].most_common(num_words)
    # Reverse the ranking so the most frequent word comes last in the sequence
    # (presumably so that its bar is drawn at the top of the horizontal chart).
    words, counts = zip(*reversed(top_pairs))

    chart_title = "Most common words of the class \"{}\"".format(kind)
    # ticklen adds some space between the y-axis labels and the plot.
    layout = go.Layout(title=chart_title, yaxis={"ticklen": 8})
    bars = go.Bar(x=counts, y=words, orientation="h")

    iplot(go.Figure(data=[bars], layout=layout), filename='bar')

In [24]:
# Plot the most frequent tokens of a single class. Swap the argument for any other
# key of word_counts ("toxic", "severe_toxic", "obscene", "threat", "insult",
# "clean") to inspect that class instead.
most_common_words("identity_hate")
