# Toxic Comment Classifier DNN 

This notebook focuses on the use of Deep Neural Networks to tackle the problem of toxic comment classification. It builds on the work done in `toxic-comment-classifier-classical-model.ipynb`.

In [6]:
import pandas as pd 
from  sklearn.model_selection import train_test_split

# Load the labelled training comments from disk.
data = pd.read_csv('./data/train.csv')

# Model input is the raw comment text; the targets are every column from the
# third one onwards.
X = data.loc[:, 'comment_text']
y = data.iloc[:, 2:]

# The split is left disabled at this point: it is applied further down, once
# the comments have been turned into fixed-length integer sequences.
# processed_data_set = train_test_split(X, y, test_size=0.33, random_state=42)
# X_train_raw, X_test_raw, y_train, y_test = processed_data_set

In the code above we load the data and select the comment text (`X`) and the label columns (`y`). The train/test split is left commented out for now: the `raw` suffix on `X_[train|test]` is a reminder that the comments need to be processed and converted to vectors before being fed into a neural network, so the split is applied further down, once that conversion is done.

## Preparing the data

We'll need to convert our comments to vectors. We're going to do this by assigning a unique ID to each word in our corpus, and then replacing every word in a comment with its ID.


In [7]:
import pyprind 
import string 
import re 
import numpy as np 
from collections import Counter 
from functools import reduce 

In [8]:
# Disabled first attempt at counting words and encoding the comments. The
# ``if False`` guard means nothing below ever runs; the bounter-based cell
# further down defines the same names (``counts``, ``word_2_int``,
# ``map_comments``, ``X_encoded``) and is the version that is actually used.
if False:
    pbar = pyprind.ProgBar(len(X), title='Counting Word Occurances')

    # Accumulate into a single Counter and a plain list. The earlier
    # reduce-based version rebuilt the whole Counter and the whole Series for
    # every comment (quadratic work) and relied on ``Series.append``, which no
    # longer exists in current pandas.
    counts = Counter()
    normalised_comments = []
    for comment in X:
        # Surround every punctuation mark with spaces so that it becomes its
        # own token, then lowercase the whole comment.
        text = ''.join([c if c not in string.punctuation else ' ' + c + ' ' for c in comment]).lower()
        counts.update(text.split())
        normalised_comments.append(text)
        pbar.update()

    # One normalised string per comment, in the same order as ``X``.
    X_encoded = pd.Series(normalised_comments)

    # Vocabulary ordered from most to least frequent word.
    word_counts = sorted(counts, key=counts.get, reverse=True)

    # Ids start at 1, leaving 0 unused by any word.
    word_2_int = {word: ii for ii, word in enumerate(word_counts, 1)}

    pbar = pyprind.ProgBar(len(X), title='Map comments to ints')

    def map_comments(comment):
        """Return the list of word ids for one normalised comment string."""
        mapped_comments = [word_2_int[word] for word in comment.split()]
        pbar.update()
        return mapped_comments

    x_mapped = pd.Series([map_comments(text) for text in X_encoded])

    print(x_mapped.iloc[:5])

In [9]:
# import pickle 
# pickle.dump(x_mapped, open('./pickles/mapped-comments.p', 'wb'))

In [10]:
from bounter import bounter 

def process_comment(comment):
    """Split a raw comment into lowercase tokens.

    Every punctuation character (as listed in ``string.punctuation``) is
    surrounded by spaces so that it ends up as a token of its own; every
    other character is lowercased. The resulting text is split on
    whitespace.

    Args:
        comment: the raw comment text.

    Returns:
        A list of token strings (empty for an empty or whitespace-only
        comment).
    """
    pieces = []
    for ch in comment:
        if ch in string.punctuation:
            pieces.append(' ' + ch + ' ')
        else:
            pieces.append(ch.lower())
    return ''.join(pieces).split()

# Tokenise every comment, then feed the tokens into a bounter counter
# (constructed with size_mb=4096).
counts = bounter(size_mb=4096)
X_processed = X.apply(process_comment)
for tokens in X_processed:
    counts.update(tokens)
print(counts['sorry'])

# Vocabulary ordered from the most to the least frequent token.
word_counts = sorted(counts, key=lambda word: counts[word], reverse=True)

# Token -> integer id. Ids start at 1, so 0 is never assigned to a token.
word_2_int = {token: rank for rank, token in enumerate(word_counts, start=1)}

def map_comments(comment):
    """Translate a tokenised comment into its list of integer word ids.

    Each token is looked up in the module-level ``word_2_int`` table; a
    token that is missing from the table raises ``KeyError``.

    Args:
        comment: iterable of token strings.

    Returns:
        A list with one integer id per token, in the original order.
    """
    word_ids = []
    for token in comment:
        word_ids.append(word_2_int[token])
    return word_ids
    
# Encode every tokenised comment as a list of word ids (one list per row).
X_encoded = pd.Series([map_comments(tokens) for tokens in X_processed])

print(X_encoded.head())

4729
0    [707, 91, 2, 145, 148, 198, 42, 693, 4530, 116...
1    [184, 9, 16714, 16, 64, 2655, 19, 576, 3810, 6...
2    [434, 445, 3, 6, 9, 83, 152, 21, 276, 5, 90, 3...
3    [4, 72, 6, 48, 9, 32, 114, 69, 352, 1457, 23, ...
4    [10, 3, 1699, 3, 28, 42, 3469, 1, 69, 1088, 10...
dtype: object


We're now going to fix the sequence length for all of our comments: comments longer than the limit keep only their last 75 tokens, and shorter ones are left-padded with zeros. This is the first hyperparameter that we can tweak. For simplicity we'll hardcode the sequence length to 75 words; 75 was chosen because it is the 75th percentile of `comment_word_count` in the `temp_data` dataframe.

In [11]:
# TODO: refactor this as part of the predict and fit methods
def pad_sequences(encoded, seq_len):
    """Turn variable-length id lists into a fixed-width integer matrix.

    Each row keeps at most its last ``seq_len`` ids, right-aligned; the
    unused leading positions stay 0.

    Args:
        encoded: sized iterable of id sequences (one per comment).
        seq_len: number of columns in the result.

    Returns:
        ``numpy.ndarray`` of shape ``(len(encoded), seq_len)`` and dtype int.
    """
    padded = np.zeros((len(encoded), seq_len), dtype=int)
    for i, row in enumerate(encoded):
        tail = row[-seq_len:]
        # An empty comment has nothing to copy. Without this guard the slice
        # ``-0:`` would select the whole row and the assignment of an empty
        # array would raise a broadcast error.
        if len(tail) > 0:
            padded[i, -len(tail):] = tail
    return padded


sequence_length = 75
sequences = pad_sequences(X_encoded, sequence_length)

In [12]:
# Hold out 33% of the padded sequences (and their labels) for evaluation; the
# fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    y,
    test_size=0.33,
    random_state=42,
)

In [13]:
# Seed NumPy's global random number generator.
# NOTE(review): no call that draws from np.random is visible in this notebook
# - confirm whether this seed is still needed.
np.random.seed(123)

def create_batch_generator(x, y=None, batch_size=64):
    """Yield consecutive, full-sized mini-batches from ``x`` (and ``y``).

    Only complete batches are produced: trailing samples that do not fill a
    whole batch are dropped, so every yielded batch has exactly
    ``batch_size`` rows. This matters for consumers that expect a fixed
    batch dimension.

    Args:
        x: sliceable sequence of inputs (list, NumPy array, ...).
        y: optional sliceable sequence of targets aligned with ``x``.
        batch_size: number of samples per batch.

    Yields:
        ``x_batch`` when ``y`` is None, otherwise ``(x_batch, y_batch)``.
    """
    # Number of *complete* batches available; the remainder is discarded.
    n_batches = len(x) // batch_size
    x = x[: n_batches * batch_size]
    if y is not None:
        y = y[: n_batches * batch_size]
    for ii in range(0, len(x), batch_size):
        if y is not None:
            yield x[ii: ii + batch_size], y[ii: ii + batch_size]
        else:
            yield x[ii: ii + batch_size]
                
# Smoke check: calling the generator function only builds a lazy generator
# object (no batch is produced until it is iterated), which is what the cell
# output displays.
xs = create_batch_generator(X_train)
xs

<generator object create_batch_generator at 0x7f5fd544c5c8>

In [52]:
import tensorflow as tf 

# n_words = max(list(word_2_int.values())) + 1
# embedding = tf.Variable(tf.random_uniform(shape=(n_words, 256), minval=-1, maxval=1))

# embed_x = tf.nn.embedding_lookup(embedding, )

class ToxicRNN(object):
    """Binary comment classifier: embedding -> stacked LSTM -> one sigmoid unit.

    The TensorFlow graph is built once in the constructor with a fixed batch
    dimension, so every batch fed to ``train`` / ``predict`` must contain
    exactly ``batch_size`` rows of ``seq_len`` word ids.

    Args:
        n_words: number of rows in the embedding matrix (vocabulary size
            including any id reserved for padding).
        seq_len: number of word ids per input sequence.
        lstm_size: number of units in each LSTM cell.
        num_layers: number of stacked LSTM cells.
        batch_size: fixed number of sequences per batch.
        learning_rate: learning rate passed to the Adam optimizer.
        embed_size: dimensionality of the word embeddings.
    """

    def __init__(self, n_words, seq_len=75, lstm_size=256, num_layers=1, batch_size=64, learning_rate=.0001, embed_size=200):
        self.n_words = n_words
        self.seq_len = seq_len
        self.lstm_size = lstm_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size

        # Each instance owns a private graph; the saver and the initializer
        # are created inside it so they cover the variables made by build().
        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()

    def build(self):
        """Create all graph nodes.

        Tensors and ops are given explicit names ('tf_x', 'tf_y',
        'tf_keepprob', 'cost', 'train_op', 'probabilities', 'labels')
        because ``train`` and ``predict`` address them by name.
        """
        ## Define the placeholders
        tf_x = tf.placeholder(tf.int32, shape=(self.batch_size, self.seq_len),
                              name='tf_x')
        tf_y = tf.placeholder(tf.float32, shape=(self.batch_size,), name='tf_y')
        tf_keepprob = tf.placeholder(tf.float32, name='tf_keepprob')

        ## Define the embedding layer: one trainable vector per word id
        embedding = tf.Variable(tf.random_uniform(shape=(self.n_words, self.embed_size), minval=-1, maxval=1),
                                name="embedding")
        embed_x = tf.nn.embedding_lookup(embedding, tf_x, name="embedded_x")

        ## Define LSTM cells (each wrapped in output dropout) and stack them
        cells = tf.contrib.rnn.MultiRNNCell([
            tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(self.lstm_size), output_keep_prob=tf_keepprob)
            for _ in range(self.num_layers)
        ])

        ## Define the initial state
        self.initial_state = cells.zero_state(self.batch_size, tf.float32)
        print(' << initial state >> ', self.initial_state)

        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(cells, embed_x, initial_state=self.initial_state)

        ## Note: lstm_outputs shape:
        ## [batch_size, max_time, cells.output_size]
        print('\n << lst_output >>', lstm_outputs)
        print('\n << final state >>', self.final_state)

        # Classify from the output of the last time step only.
        logits = tf.layers.dense(
            inputs=lstm_outputs[:, -1],
            units=1, activation=None,
            name='logits')

        # Drop only the trailing unit axis so the result stays rank 1
        # (matching tf_y) even when batch_size is 1.
        logits = tf.squeeze(logits, axis=1, name='logits_squeezed')
        print('\n << logits >>', logits)

        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        # Building this dict also creates the 'labels' op that predict()
        # fetches by name.
        predictions = {
            'probabilities': y_proba,
            'labels': tf.cast(tf.round(y_proba), tf.int32,
                              name='labels')
        }

        ## Define the cost function
        cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf_y, logits=logits), name='cost'
            )

        ## Define the optimiser
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')

    def train(self, X_train, y_train, num_epochs):
        """Train the network and write a checkpoint every 10th epoch.

        Args:
            X_train: sliceable collection of input sequences.
            y_train: sliceable collection of targets aligned with X_train.
            num_epochs: number of passes over the training data.
        """
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            iteration = 1
            for epoch in range(num_epochs):
                # Start each epoch from the zero state; within the epoch the
                # final state of one batch is fed as the initial state of
                # the next.
                state = sess.run(self.initial_state)

                for batch_x, batch_y in create_batch_generator(
                        X_train, y_train, self.batch_size):
                    feed = {'tf_x:0': batch_x,
                            'tf_y:0': batch_y,
                            'tf_keepprob:0': .5,
                            self.initial_state: state}
                    loss, _, state = sess.run(
                        ['cost:0', 'train_op', self.final_state], feed_dict=feed
                    )

                    if iteration % 20 == 0:
                        print("Epoch: %d/%d Iteration: %d "
                              "| Train loss: %.5f" % (epoch + 1, num_epochs, iteration, loss))
                    iteration += 1

                # Checkpoint once at the end of every 10th epoch. This check
                # used to sit inside the batch loop, which rewrote the same
                # checkpoint after every single batch of those epochs.
                if (epoch + 1) % 10 == 0:
                    self.saver.save(sess, "model/sentiment-%d.ckpt" % epoch)

    def predict(self, X_data, return_proba=False):
        """Run the latest checkpoint from './model' over ``X_data``.

        Args:
            X_data: sliceable collection of input sequences.
            return_proba: when True return sigmoid probabilities, otherwise
                the rounded int32 class labels.

        Returns:
            The per-batch results concatenated into one NumPy array.
        """
        preds = []
        # Choose the tensor to fetch once, instead of branching per batch.
        fetch_name = 'probabilities:0' if return_proba else 'labels:0'
        with tf.Session(graph=self.g) as sess:
            self.saver.restore(
                sess, tf.train.latest_checkpoint('./model'))
            test_state = sess.run(self.initial_state)
            for batch_x in create_batch_generator(X_data, None, batch_size=self.batch_size):
                # Dropout is disabled at inference time (keep probability 1).
                feed = {'tf_x:0': batch_x,
                        'tf_keepprob:0': 1.0,
                        self.initial_state: test_state}
                pred, test_state = sess.run(
                    [fetch_name, self.final_state],
                    feed_dict=feed)
                preds.append(pred)
        return np.concatenate(preds)

In [61]:
# Word ids start at 1 and the padded sequences use 0 as filler, so the
# embedding needs one row more than the largest word id.
n_words = max(word_2_int.values()) + 1

rnn = ToxicRNN(
    n_words=n_words,
    seq_len=sequence_length,
    embed_size=256,
    lstm_size=128,
    num_layers=10,
    batch_size=64,
    learning_rate=.001,
)

 << initial state >>  (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(64, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(64, 128) dtype=float32>), LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState_1/BasicLSTMCellZeroState/zeros:0' shape=(64, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState_1/BasicLSTMCellZeroState/zeros_1:0' shape=(64, 128) dtype=float32>), LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState_2/BasicLSTMCellZeroState/zeros:0' shape=(64, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState_2/BasicLSTMCellZeroState/zeros_1:0' shape=(64, 128) dtype=float32>), LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState_3/BasicLSTMCellZeroState/zeros:0' shape=(64, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZero

In [None]:
# Train on the 'toxic' label only. The labels are passed as a plain NumPy
# array (rather than the Series' .iloc indexer object) so that they are
# sliced positionally, exactly like X_train, without depending on how the
# batch generator happens to index its arguments.
rnn.train(X_train, y_train['toxic'].values, num_epochs=40)

In [63]:
# Predict labels for the first full batch of the held-out set; the slice
# length is tied to the model's batch size instead of repeating the literal.
rnn.predict(X_test[:rnn.batch_size])

INFO:tensorflow:Restoring parameters from ./model/sentiment-39.ckpt


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32)