In [118]:
# Import relevant packages
import pandas as pd
import numpy as np
import string
from tqdm import tqdm
import random

# plotly imports
from plotly import tools
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

# import statistical functions
from statistics import *

# NLP library imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import nltk

tqdm.pandas()

In [48]:
# Load the training comments from `train.csv` (resolved against the working directory)
train_df = pd.read_csv('train.csv')
# Preview the first 10 rows; as the cell's last expression it is rendered by the notebook
train_df.iloc[:10]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [52]:
# Report the number of missing entries per column.
# `DataFrame.isnull` is an alias of `DataFrame.isna`, so both reports are
# expected to list the same counts.
print('Check for missing values', train_df.isnull().sum(), sep='\n')
print('Check for NA/NaN', train_df.isna().sum(), sep='\n')

Check for missing values
id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64
Check for NA/NaN
id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64


In [43]:
LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Aggregate the label columns into a single 'target' column: the row-wise
# maximum is non-zero as soon as any individual label is set.
# The guard keeps the cell re-runnable: once the label columns have been
# dropped, a second execution would otherwise fail with a KeyError. If neither
# 'target' nor the label columns exist, the KeyError is still raised.
if 'target' not in train_df.columns:
    train_df['target'] = train_df[LABELS].max(axis=1)
    # Drop the original labels
    train_df.drop(labels=LABELS, axis=1, inplace=True)

In [38]:
# What percentage of comments are toxic?

# Dataframe with only toxic comments (aggregated target == 1)
toxic_df = train_df[train_df['target'] == 1]
# Dataframe with only nontoxic comments (aggregated target == 0)
clean_df = train_df[train_df['target'] == 0]

num_toxic = len(toxic_df)
# Count the frame defined just above; the previous name `nontoxic_df` was never
# assigned, so this line raised NameError on a fresh kernel.
num_clean = len(clean_df)
print(f'There are {num_toxic} toxic comments')
print(f'There are {num_clean} clean comments')
percent_toxic = (100 * num_toxic / (num_toxic + num_clean))
print(f'{percent_toxic}% of comments are toxic')

There are 16225 toxic comments
There are 143346 toxic comments
10.167887648758233% of comments are toxic


In [71]:
# Show the first five clean comments, separated by a dashed rule
print('\n------------------\n'.join(clean_df['comment_text'].head(5)))

Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
------------------
D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)
------------------
Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.
------------------
"
More
I can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have an

In [72]:
# Show the first five toxic comments, separated by a dashed rule
print('\n------------------\n'.join(toxic_df['comment_text'].head(5)))

COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
------------------
Hey... what is it..
@ | talk .
What is it... an exclusive group of some WP TALIBANS...who are good at destroying, self-appointed purist who GANG UP any one who asks them questions abt their ANTI-SOCIAL and DESTRUCTIVE (non)-contribution at WP?

------------------
Bye! 

Don't look, come or think of comming back! Tosser.
------------------
You are gay or antisemmitian? 

Archangel WHite Tiger

Meow! Greetingshhh!

Uh, there are two ways, why you do erased my comment about WW2, that holocaust was brutally slaying of Jews and not gays/Gypsys/Slavs/anyone...

1 - If you are anti-semitian, than shave your head bald and go to the skinhead meetings!

2 - If you doubt words of the Bible, that homosexuality is a deadly sin, make a pentagram tatoo on your forehead go to the satanistic masses with your gay pals!


Beware of the Dark Side!
------------------
FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!


### Ngram analysis

In [90]:
# spaCy tokenizer and filter tables used to normalise comment text
punctuations = string.punctuation
stopwords = list(STOP_WORDS)
# Hashed copy of `stopwords` so that the per-token membership test in
# `clean_text` does not scan the whole list for every token.
_stopword_lookup = frozenset(stopwords)

parser = English()
def clean_text(text):
    """Normalise ``text`` into a single space-separated string of kept tokens.

    Each token produced by ``parser`` is replaced by its ``lemma_`` (lower-cased
    and stripped), except tokens whose lemma is the ``-PRON-`` placeholder,
    which fall back to their lower-cased surface form. A token is then dropped
    when it is

    * empty after stripping,
    * a stop word, or
    * made up entirely of characters from ``string.punctuation``.

    The punctuation rule is a per-character test. The earlier
    ``word not in punctuations`` check was a substring test against the
    punctuation string, so runs such as ``...`` or ``--`` were kept. Everything
    the substring test removed is still removed here.

    Args:
        text: The raw text to normalise.

    Returns:
        The kept tokens joined by single spaces (``""`` if none are kept).
    """
    kept = []
    for token in parser(text):
        if token.lemma_ != "-PRON-":
            normalised = token.lemma_.lower().strip()
        else:
            normalised = token.lower_
        if not normalised or normalised in _stopword_lookup:
            continue
        if all(char in punctuations for char in normalised):
            continue
        kept.append(normalised)
    return " ".join(kept)

In [91]:
# functions to generate ngrams from text

def _ngrams(text, n):
    """Return every run of ``n`` consecutive whitespace-separated tokens of ``text``.

    Each run is joined with a single space. Texts with fewer than ``n`` tokens
    yield an empty list.
    """
    tokens = text.split()
    return [' '.join(tokens[start:start + n]) for start in range(len(tokens) - n + 1)]

def get_words(text):
    """Return the whitespace-separated tokens of ``text`` as a list of strings."""
    return _ngrams(text, 1)

def get_bigrams(text):
    """Return the space-joined pairs of adjacent tokens of ``text``, in order."""
    return _ngrams(text, 2)

def get_trigrams(text):
    """Return the space-joined triples of adjacent tokens of ``text``, in order."""
    return _ngrams(text, 3)

In [116]:
# functions for visualizing ngram plots

# Pool of hex colour strings passed as the bar marker colours in
# `ngram_visualizer`; `ngram_plot` shuffles this list in place before plotting.
color_brewer = ['#57B8FF','#B66D0D','#009FB7','#FBB13C','#FE6847','#4FB5A5','#8C9376','#F29F60','#8E1C4A','#85809B','#515B5D','#9EC2BE','#808080','#9BB58E','#5C0029','#151515','#A63D40','#E9B872','#56AA53','#CE6786','#449339','#2176FF','#348427','#671A31','#106B26','#008DD5','#034213','#BC2F59','#939C44','#ACFCD9','#1D3950','#9C5414','#5DD9C1','#7B6D49','#8120FF','#F224F2','#C16D45','#8A4F3D','#616B82','#443431','#340F09']

def ngram_visualizer(v,t):
    """Draw a horizontal bar chart inline in the notebook.

    Args:
        v: Object exposing ``.values`` (used as bar lengths) and ``.index``
            (used as bar labels), e.g. the result of ``Series.value_counts()``.
        t: Title shown above the chart.

    Bar colours are read from the module-level ``color_brewer`` list.
    """
    bars = go.Bar(
        x=v.values,
        y=v.index,
        orientation='h',
        marker=dict(color=color_brewer, line=dict(color='rgb(8,48,107)', width=1.5)),
        opacity=0.6,
    )
    layout = go.Layout(
        title=t,
        # `go.Margin` is deprecated; plotly's warning names
        # `plotly.graph_objs.layout.Margin` as its replacement.
        # The wide left/right margins leave room for long n-gram labels.
        margin=go.layout.Margin(l=200, r=400),
    )

    fig = go.Figure(data=[bars], layout=layout)
    iplot(fig, filename='horizontal-bar')
    
def ngram_plot(ngrams,title):
    """Plot the 20 most frequent items found across all rows of ``ngrams``.

    Args:
        ngrams: Series whose elements are lists of n-gram strings.
        title: Title forwarded to ``ngram_visualizer``.

    Side effect: shuffles the module-level ``color_brewer`` list in place, so
    bar colours change between calls.
    """
    rows = tqdm(ngrams.values, total=ngrams.shape[0])
    # Flatten the per-row lists into one long list of n-grams
    flattened = [gram for row in rows for gram in row]
    random.shuffle(color_brewer)
    frequencies = pd.Series(flattened).value_counts()
    ngram_visualizer(frequencies[:20], title)

In [114]:
# Normalise the text of the first 10,000 clean comments and of every toxic comment
clean = clean_df['comment_text'].head(10000).apply(clean_text)
toxic = toxic_df['comment_text'].apply(clean_text)





 61%|██████    | 6098/10000 [00:26<00:10, 370.37it/s][A[A[A[A

In [119]:
# What are the top words in clean comments?
# Split each cleaned comment into its words, then chart the most frequent ones.
clean_words = clean.apply(get_words)
ngram_plot(ngrams=clean_words, title='Top Clean Words')






  0%|          | 0/10000 [00:00<?, ?it/s][A[A[A[A[A





plotly.graph_objs.Margin is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.Margin




### Meta-feature analysis

In [85]:
# Distribution plot
def plot_distribution(a,b,title,bins=0.1,colors=['#3A4750', '#F64E8B']):
    """Plot two samples as overlaid distribution plots, followed by a summary table.

    Args:
        a: Numeric sample labelled "Clean comments".
        b: Numeric sample labelled "Toxic comments".
        title: Title of the distribution plot.
        bins: Forwarded to ``ff.create_distplot`` as ``bin_size``.
        colors: Two colours, one per sample. The default list is only read
            here, never mutated.

    The table reports mean, population standard deviation, population
    variance, median, maximum and minimum for each sample.
    """
    dist_fig = ff.create_distplot([a,b], ["Clean comments","Toxic comments"], bin_size=bins, colors=colors, show_rug=False)
    dist_fig['layout'].update(title=title)
    iplot(dist_fig, filename='Distplot')

    # NumPy reductions are vectorised and return floating-point mean/std/var
    # whatever the integer dtype of the inputs. `np.std` and `np.var` default
    # to ddof=0, i.e. the population statistics named in the table.
    measures = [
        ("Mean", np.mean),
        ("Standard Deviation", np.std),
        ("Variance", np.var),
        ("Median", np.median),
        ("Maximum value", np.max),
        ("Minimum value", np.min),
    ]
    table_data = [["Statistical Measures","Clean comments","Toxic comments"]]
    for label, measure in measures:
        table_data.append([label, measure(a), measure(b)])
    table_fig = ff.create_table(table_data)
    iplot(table_fig, filename='Table')

In [86]:
# What is the distribution of comment length for clean and toxic comments?

# Character count of every clean and toxic comment, as NumPy arrays
length_clean = np.array(clean_df['comment_text'].map(len))
length_toxic = np.array(toxic_df['comment_text'].map(len))
# Compare the two length distributions using 40-character bins
plot_distribution(length_clean, length_toxic, title='Comment Length', bins=40)