# Toxic Comment Classifier DNN 

This notebook will focus on the use of Deep Neural Networks to tackle the problem of tox comment classification. Starting from the work done in the `toxic-comment-classifier-classical-model.ipynb

In [8]:
import pandas as pd 
from  sklearn.model_selection import train_test_split

data = pd.read_csv('./data/train.csv')

X = data['comment_text']
y = data[data.columns[2:]]

# processed_data_set = train_test_split(X, y, test_size=0.33, random_state=42)
# X_train_raw, X_test_raw, y_train, y_test = processed_data_set

In the above code we've started off loading and splitting our data into test/training datasets. We've appended `raw` to the `X_[train|test]` because the comments will be need to be processed and converted to vectors before being feed into a nueural network. 

## Prepraring the data

We'll need to convert our comments to vectors. We're going to do this by assigning an unique ID to each word in our corpus. We'll then convert our comments into vectors  


In [38]:
import pyprind 
import string 
import re 
import numpy as np 
from collections import Counter 
from functools import reduce 

In [37]:
# counts = Counter()
pbar = pyprind.ProgBar(len(X), title='Counting Word Occurances')

def concat_counts(counter_series, comment):
    counter_res, series_res = counter_series
    text = ''.join([c if c not in string.punctuation else ' ' + c + ' ' for c in comment]).lower()
    pbar.update()
    counter = Counter()
    counter.update(text.split())
    return(counter_res + counter, series_res.append(pd.Series(text)))

# for i, comment in enumerate(X[:10]):
#     text = ''.join([c if c not in string.punctuation else ' ' + c + ' ' for c in comment]).lower()
#     pbar.update()

counts, X_encoded = reduce(concat_counts, X, (Counter(), pd.Series())) 

word_counts = sorted(counts, key=counts.get, reverse=True)

word_2_int = {word: ii for ii, word in enumerate(word_counts, 1)}

pbar = pyprind.ProgBar(len(X), title='Map comments to ints')

def map_comments(comment):
    mapped_comments = [word_2_int[word] for word in comment.split()]
    pbar.update()
    return mapped_comments
    
x_mapped = pd.Series(map(map_comments, X_encoded))

print(x_mapped.iloc[:5])

Counting Word Occurances
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 04:10:51
Map comments to ints


0    explanation\nwhy the edits made under my usern...
0    d ' aww !  he matches this background colour i...
0    hey man ,  i ' m really not trying to edit war...
0     " \nmore\ni can ' t make any real suggestions...
0    you ,  sir ,  are my hero .  any chance you re...
0     " \n\ncongratulations from me as well ,  use ...
0         cocksucker before you piss around on my work
0    your vandalism to the matt shirvington article...
0    sorry if the word  ' nonsense '  was offensive...
0    alignment on this subject and which are contra...
0     " \nfair use rationale for image : wonju . jp...
0    bbq \n\nbe a man and lets discuss it - maybe o...
0    hey .  .  .  what is it .  . \n @   |  talk  ....
0    before you start throwing accusations and warn...
0    oh ,  and the girl above started her arguments...
0     " \n\njuelz santanas age\n\nin 2002 ,  juelz ...
0    bye !  \n\ndon ' t look ,  come or think of co...
0    redirect talk : voydan pop georgiev -  chernod...
0    the m

0% [##############################] 100% | ETA: 00:00:00

0    [706, 91, 2, 145, 148, 198, 42, 693, 4507, 114...
1    [184, 9, 16468, 16, 64, 2654, 19, 576, 3810, 6...
2    [434, 445, 3, 6, 9, 83, 152, 21, 276, 5, 90, 3...
3    [4, 72, 6, 48, 9, 32, 114, 69, 352, 1455, 23, ...
4    [10, 3, 1696, 3, 28, 42, 3463, 1, 69, 1087, 10...
dtype: object



Total time elapsed: 00:00:05


In [43]:
import pickle 
pickle.dump(x_mapped, open('./pickles/mapped-comments.p', 'wb'))

We're now going to reduce the sequence length for all of our comments. This will be our first hyper parameter that we can tweak. For simplicity we'll hardcode our sequence length to 75 words. 75 was chosen because it was the value for the 75th percentile of the `comment_word_count` in the temp_data dataframe.

In [67]:
# TODO: refactor this as part of the predict and fit methods
sequence_length = 75
sequences = np.zeros((len(x_mapped), sequence_length), dtype=int)

for i, row in enumerate(x_mapped):
    comments_arr = np.array(row)
    sequences[i, -len(row):] = comments_arr[-sequence_length:]

In [74]:
X_train, X_test, y_train, y_test =  train_test_split(sequences, y, test_size=0.33, random_state=42)

In [87]:
np.random.seed(123)

def create_batch_generator(x, y=None, batch_size=64):
    n_batches = len(x)
    x = x[: n_batches * batch_size]
    if y is not None:
        y = y[:n_batches * batch_size]
    for ii in range(0, len(x), batch_size):
        if y is not None:
            yield x[ii: ii + batch_size], y[ii: ii + batch_size]
        else:
            yield x[ii: ii + batch_size]
                
xs = create_batch_generator(X_train)
xs

<generator object create_batch_generator at 0x114060bf8>

In [106]:
import tensorflow as tf 

# n_words = max(list(word_2_int.values())) + 1
# embedding = tf.Variable(tf.random_uniform(shape=(n_words, 256), minval=-1, maxval=1))

# embed_x = tf.nn.embedding_lookup(embedding, )

class ToxicRNN(object):
    def __init__(self, n_words, seq_len=75, lstm_size=256, num_layers=1, batch_size=64, learning_rate=.0001, embed_size=200):
        self.n_words = n_words
        self.seq_len = seq_len
        self.lstm_size = lstm_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size
        
        self.g = tf.Graph()
        with self.g.as_default():
            self.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()
            
    def build(self):
        ## Define the placholders 
        tf_x = tf.placeholder(tf.int32, shape=(self.batch_size, self.seq_len),
                             name='tf_x')
        tf_y = tf.placeholder(tf.float32, shape=(self.batch_size), name='tf_y')
        tf_keepprob = tf.placeholder(tf.float32, name='tf_keepprob')
        
        ## Define LSTM cell and stack them together
        embedding = tf.Variable(tf.random_uniform(shape=(self.n_words, self.embed_size), minval=-1, maxval=1),
                               name="embedding")
        embed_x = tf.nn.embedding_lookup(embedding, tf_x, name="embedded_x")
        
        ## Define LSTM cell and stack them together 
        cells = tf.contrib.rnn.MultiRNNCell([
            tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.BasicLSTMCell(self.lstm_size), output_keep_prob=tf_keepprob)
            for i in range(self.num_layers)
        ])
        
        ## Define the initial state
        self.init_state = cells.zero_state(self.batch_size, tf.float32)
        print(' << initial state >> ', self.inital_state)
        
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(cells, embed_x, initial_state=self.initial_state)
        
        ## Note: lstm_outputs shape: