# Toxic Comment Classifier DNN 

This notebook will focus on the use of Deep Neural Networks to tackle the problem of tox comment classification. Starting from the work done in the `toxic-comment-classifier-classical-model.ipynb

In [31]:
import pandas as pd 
from  sklearn.model_selection import train_test_split
from keras.utils import plot_model

data = pd.read_csv('./data/train.csv')
print(data.shape)
X = data['comment_text'].values
y = data[data.columns[2:]].values

X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.2, random_state=42)

(159571, 8)


In [32]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

tokenizer.fit_on_texts(X_train)
tokenized_train = tokenizer.texts_to_sequences(X_train)
tokenized_test = tokenizer.texts_to_sequences(X_test)
word_index = tokenizer.word_index


In [33]:
from keras.preprocessing.sequence import pad_sequences

max_len = 70

padded_train = pad_sequences(tokenized_train, maxlen=max_len, padding='post')
padded_test = pad_sequences(tokenized_test, maxlen=max_len, padding='post')
print(X_train[0])
print(tokenized_train[0])
print(padded_train[0])



Grandma Terri Should Burn in Trash 
Grandma Terri is trash. I hate Grandma Terri. F%%K her to HELL! 71.74.76.40
[12927, 8296, 56, 3980, 10, 4414, 12927, 8296, 8, 4414, 7, 398, 12927, 8296, 871, 1369, 184, 2, 866, 1697, 2609, 1738, 1336]
[12927  8296    56  3980    10  4414 12927  8296     8  4414     7   398
 12927  8296   871  1369   184     2   866  1697  2609  1738  1336     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0]


We're going to use a pretrained Word2Vector as the basis of our embedded vocabulary. The pretrained Word2Vec model is going to be Facebook's [fastText](https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec). For more info please find it [here](https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md).

In [34]:
import numpy as np
embedding_dim = 300

def process_pretrained_word_vec(line):
    values = line.rstrip().rsplit(' ', embedding_dim)
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    return (word, coefs)
    
with open('./wiki.en.vec', encoding='utf8') as f:
    embedding = dict(map(process_pretrained_word_vec, f))


In [35]:
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [6]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.065334  , -0.093031  , -0.017571  , ...,  0.16642   ,
        -0.13079   ,  0.035397  ],
       [-0.21341   ,  0.15353   ,  0.05288   , ..., -0.025937  ,
        -0.072507  ,  0.14989001],
       ...,
       [ 0.20563   ,  0.18877   , -0.61066997, ...,  0.43869999,
        -0.19874001,  0.32304999],
       [-0.25375   , -0.24808   , -0.17106   , ...,  0.28101999,
         0.30978999,  0.233     ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

## CNN 

In [36]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, BatchNormalization, Dense, Dropout
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.metrics import roc_auc_score

cnn_models = []
cnn_params = [
    {
        'cnn1D_layers_filters': [512]
    },
    {
        'cnn1D_layers_filters': [512, 256]
    },
    {
        'cnn1D_layers_filters': [512, 256, 64]
    }
]
for cnn_param in cnn_params:
    
    model = Sequential()

    model.add(Embedding(len(embedding_matrix),
                        embedding_dim, weights=[embedding_matrix],
                        input_length=max_len, trainable=True)
             )

    # Add Convolutional layer(s)
    for filters in cnn_param['cnn1D_layers_filters']:
        model.add(Conv1D(filters=filters, kernel_size=5, padding='same', activation='relu'))
        
    model.add(MaxPooling1D(3))
    model.add(GlobalMaxPooling1D())
    model.add(BatchNormalization())
    # Add fully connected layers
    model.add(Dense(50, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(6, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    print(model.summary())
    cnn_models.append(model)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 70, 300)           54959400  
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 70, 512)           768512    
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 23, 512)           0         
_________________________________________________________________
global_max_pooling1d_9 (Glob (None, 512)               0         
_________________________________________________________________
batch_normalization_9 (Batch (None, 512)               2048      
_________________________________________________________________
dense_17 (Dense)             (None, 50)                25650     
_________________________________________________________________
dropout_9 (Dropout)          (None, 50)                0         
__________

In [39]:
from sklearn.metrics import roc_auc_score

epochs = [1,2,5]
cnn_trained_models = []
for model in cnn_models:
    for epoch in epochs:
        print(model.summary())
        model.fit(padded_train, y_train, epochs=epoch)
        y_pred = model.predict(padded_test)
        print(roc_auc_score(y_test, y_pred))
        cnn_trained_models.append(model)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 70, 300)           54959400  
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 70, 512)           768512    
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 23, 512)           0         
_________________________________________________________________
global_max_pooling1d_9 (Glob (None, 512)               0         
_________________________________________________________________
batch_normalization_9 (Batch (None, 512)               2048      
_________________________________________________________________
dense_17 (Dense)             (None, 50)                25650     
_________________________________________________________________
dropout_9 (Dropout)          (None, 50)                0         
__________

Epoch 2/2
0.9811202454045903
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 70, 300)           54959400  
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 70, 512)           768512    
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 70, 256)           655616    
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 23, 256)           0         
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 256)               0         
_________________________________________________________________
batch_normalization_10 (Batc (None, 256)               1024      
_________________________________________________________________
dense_19 (Dense)             (None, 50)        

In [62]:
def d_zip(*args):
    x, y, z = args
    d = round(len(x) / len(y))
    return list(zip(x, y*d, z*d))
    
d_zip([1,2,3], ['a'], ['x','y'])
cnn_scores = d_zip([0.9821650781544098,
 0.9766720735209405,
 0.9716734617442241,
 0.9800540408174648,
 0.9811202454045903,
 0.9743238867162961,
 0.9774988142661806,
 0.9722849820657183],epochs, cnn_params)

In [67]:
plot_model(cnn_models[0], show_shapes=True, to_file='model_cnn.png')

In [40]:
test = pad_sequences(tokenizer.texts_to_sequences(["I will kill you "]), maxlen=max_len, padding='post')
model.predict([test])

array([[0.8660351 , 0.0452344 , 0.05817014, 0.9524812 , 0.03342581,
        0.02150893]], dtype=float32)

In [63]:
sorted(cnn_scores, key=lambda x: x[0], reverse=True)

[(0.9821650781544098, 1, {'cnn1D_layers_filters': [512]}),
 (0.9811202454045903, 2, {'cnn1D_layers_filters': [512, 256]}),
 (0.9800540408174648, 1, {'cnn1D_layers_filters': [512]}),
 (0.9774988142661806, 1, {'cnn1D_layers_filters': [512]}),
 (0.9766720735209405, 2, {'cnn1D_layers_filters': [512, 256]}),
 (0.9743238867162961, 5, {'cnn1D_layers_filters': [512, 256, 64]}),
 (0.9722849820657183, 2, {'cnn1D_layers_filters': [512, 256]}),
 (0.9716734617442241, 5, {'cnn1D_layers_filters': [512, 256, 64]})]

In [46]:
from sklearn.metrics import roc_auc_score
y_pred = model.predict(padded_test)
print(roc_auc_score(y_test, y_pred))

0.981859497387775


In [33]:
# 0.9822522540504516

In [64]:
from keras.layers import LSTM
model_rnn = Sequential()

model_rnn.add(Embedding(len(embedding_matrix),
                    embedding_dim, weights=[embedding_matrix],
                    input_length=max_len, trainable=True)
         )

model_rnn.add(LSTM(60, return_sequences=True, name='lstm_layer'))
model_rnn.add(Conv1D(filters=512, kernel_size=5, padding='same', activation='relu'))
model_rnn.add(MaxPooling1D(3))
model_rnn.add(GlobalMaxPooling1D())
model_rnn.add(BatchNormalization())
# Add fully connected layers
model_rnn.add(Dense(50, activation='relu'))
model_rnn.add(Dropout(0.3))
model_rnn.add(Dense(6, activation='sigmoid'))

model_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print(model_rnn.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 70, 300)           54959400  
_________________________________________________________________
lstm_layer (LSTM)            (None, 70, 60)            86640     
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 70, 512)           154112    
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 23, 512)           0         
_________________________________________________________________
global_max_pooling1d_12 (Glo (None, 512)               0         
_________________________________________________________________
batch_normalization_12 (Batc (None, 512)               2048      
_________________________________________________________________
dense_23 (Dense)             (None, 50)                25650     
__________

In [68]:
plot_model(model_rnn, show_shapes=True, to_file='model_rnn.png')

In [66]:
model_rnn.fit(padded_train, y_train, epochs=1)

Epoch 1/1


<keras.callbacks.History at 0x7f2bcd75beb8>

In [42]:
from sklearn.metrics import roc_auc_score
y_pred = model_rnn.predict(padded_test)
print(roc_auc_score(y_test, y_pred))

0.9828070415809732
