# Toxic Comment Classifier DNN 

This notebook focuses on using Deep Neural Networks to tackle the problem of toxic comment classification, starting from the work done in the `toxic-comment-classifier-classical-model.ipynb` notebook.

In [1]:
import pandas as pd 
from  sklearn.model_selection import train_test_split

# Load the labelled comments and report the frame's dimensions.
data = pd.read_csv('./data/train.csv')
print(data.shape)

# Inputs are the raw comment strings; targets are every column after the
# first two.
label_columns = data.columns[2:]
X = data['comment_text'].values
y = data[label_columns].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

(159571, 8)


In [2]:
from keras.preprocessing.text import Tokenizer

# Fit the vocabulary on the training comments only, then map both splits to
# integer sequences using that same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

tokenized_train, tokenized_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# word -> integer index lookup, used later to build the embedding matrix.
word_index = tokenizer.word_index


Using TensorFlow backend.


In [3]:
from keras.preprocessing.sequence import pad_sequences

# Every comment is forced to this many tokens.
max_len = 70

# Pad at the end of short sequences so all inputs share one length.
# NOTE(review): `truncating` is left at its default, so confirm which end of
# over-long comments is meant to be kept.
padded_train, padded_test = (
    pad_sequences(sequences, maxlen=max_len, padding='post')
    for sequences in (tokenized_train, tokenized_test)
)

# Show the first training comment at each stage: raw text, token ids, padded ids.
for stage in (X_train, tokenized_train, padded_train):
    print(stage[0])



Grandma Terri Should Burn in Trash 
Grandma Terri is trash. I hate Grandma Terri. F%%K her to HELL! 71.74.76.40
[12927, 8296, 56, 3980, 10, 4414, 12927, 8296, 8, 4414, 7, 398, 12927, 8296, 871, 1369, 184, 2, 866, 1697, 2609, 1738, 1336]
[12927  8296    56  3980    10  4414 12927  8296     8  4414     7   398
 12927  8296   871  1369   184     2   866  1697  2609  1738  1336     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0]


We're going to use pretrained word vectors as the basis of our embedded vocabulary. The pretrained model is Facebook's [fastText](https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec). More information is available [here](https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md).

In [4]:
import numpy as np
embedding_dim = 300


def process_pretrained_word_vec(line):
    """Split one line of a text-format vector file into ``(word, vector)``.

    The line is stripped of trailing whitespace and split on single spaces
    from the right, peeling off at most ``embedding_dim`` fields; whatever
    remains on the left is the word, so a word that itself contains spaces
    stays intact when the line carries a full set of coefficients.

    Returns a tuple of the word (str) and a float32 NumPy array holding the
    remaining fields. No length check is done here: a line with fewer fields
    yields a shorter (possibly empty) array.
    """
    word, *raw_coefs = line.rstrip().rsplit(' ', embedding_dim)
    return word, np.asarray(raw_coefs, dtype='float32')
    
# Parse the text-format vector file into a {word: vector} lookup.
#
# The .vec format opens with a "<vocab_size> <dim>" header line, which the
# line parser turns into a bogus entry whose "vector" has a single element.
# If that token ever appeared in the tokenizer vocabulary, the one-element
# array would broadcast across a whole embedding-matrix row. Keeping only rows
# with exactly `embedding_dim` coefficients drops the header, as well as any
# blank or malformed line.
with open('./wiki.en.vec', encoding='utf8') as f:
    embedding = {
        word: coefs
        for word, coefs in map(process_pretrained_word_vec, f)
        if len(coefs) == embedding_dim
    }


In [34]:
# from math import sqrt

# # vect  = embedding['paris'] - embedding['france'] + embedding['italy']
# # np.array_equal(vect, embedding['rome'])

# def euclidian_distance(v1, v2):
#     return np.sqrt(np.sum((v1 - v2) ** 2 ))

# # vect = euclidian_distance(embedding['france'], embedding['paris']) + embedding['italy']
# # nearest_vect = [(k, euclidian_distance())]
# # [k for k, v in embedding.items() if np.array_equal(v, vect)]

# def find_nearest_word(word):
#     base_vect = embedding[word]
#     words_with_distance = [(k, euclidian_distance(base_vect, v)) for k, v in embedding.items()]
#     return sorted(words_with_distance, key=lambda x: x[1])[1:15]

# # print(find_nearest_word('spain'))

# def closest_analogy(left1, left2, right1):
#     vec = (embedding[left1] - embedding[left2]) + embedding[right1]
#     words_with_distance = [(k, euclidian_distance(vec, v)) for k, v in embedding.items()]
#     return sorted(words_with_distance, key=lambda x: x[1])[0:15]

# closest_analogy('paris', 'france', 'rome')

In [6]:
# One row per vocabulary index plus one extra row, so the largest index in
# `word_index` is still a valid row. Rows start as zeros.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, embedding_dim))

# Copy the pretrained vector for every vocabulary word that has one; words
# missing from the pretrained lookup keep their all-zero row.
for token, row in word_index.items():
    pretrained_vector = embedding.get(token)
    if pretrained_vector is None:
        continue
    embedding_matrix[row] = pretrained_vector

In [7]:
# Display the assembled matrix as a quick sanity check.
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.065334  , -0.093031  , -0.017571  , ...,  0.16642   ,
        -0.13079   ,  0.035397  ],
       [-0.21341   ,  0.15353   ,  0.05288   , ..., -0.025937  ,
        -0.072507  ,  0.14989001],
       ...,
       [ 0.20563   ,  0.18877   , -0.61066997, ...,  0.43869999,
        -0.19874001,  0.32304999],
       [-0.25375   , -0.24808   , -0.17106   , ...,  0.28101999,
         0.30978999,  0.233     ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

## CNN 

In [43]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, BatchNormalization, Dense, Dropout
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.metrics import roc_auc_score

# CNN classifier: pretrained embeddings -> 1D convolution -> max pooling ->
# batch norm -> small dense head with one sigmoid output per label.
model = Sequential()

# Embedding layer initialised from the pretrained matrix; trainable=True lets
# the vectors be updated during fitting.
model.add(Embedding(len(embedding_matrix),
                    embedding_dim, weights=[embedding_matrix],
                    input_length=max_len, trainable=True)
         )

# Add Convolutional layer
model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(3))
model.add(GlobalMaxPooling1D())
model.add(BatchNormalization())
# Add fully connected layers
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.3))
# Six independent sigmoid outputs, matching the six label columns in `y`.
model.add(Dense(6, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so call it directly;
# wrapping it in print() appends a stray "None" to the output.
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 70, 300)           54959400  
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 70, 128)           192128    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 23, 128)           0         
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
_________________________________________________________________
batch_normalization_4 (Batch (None, 128)               512       
_________________________________________________________________
dense_7 (Dense)              (None, 50)                6450      
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
__________

In [44]:
# Train the CNN for a single epoch on the padded training sequences.
model.fit(x=padded_train, y=y_train, epochs=1)

Epoch 1/1


<keras.callbacks.History at 0x7f9d7804e080>

In [45]:
# Run one hand-written comment through the same tokenise -> pad pipeline used
# for the training data, then ask the CNN for its per-label scores.
sample_sequences = tokenizer.texts_to_sequences(["I will kill you "])
test = pad_sequences(sample_sequences, maxlen=max_len, padding='post')
model.predict([test])

array([[0.957432  , 0.18879408, 0.12028662, 0.7672841 , 0.1591645 ,
        0.01995854]], dtype=float32)

In [46]:
from sklearn.metrics import roc_auc_score
# Score the CNN on the held-out split and report ROC AUC.
y_pred = model.predict(padded_test)
cnn_auc = roc_auc_score(y_test, y_pred)
print(cnn_auc)

0.981859497387775


In [33]:
# 0.9822522540504516

In [38]:
from keras.layers import LSTM
# LSTM + CNN classifier: same embedding and dense head as the CNN model, with
# an LSTM layer inserted ahead of the convolution.
model_rnn = Sequential()

# Embedding layer initialised from the pretrained matrix; trainable=True lets
# the vectors be updated during fitting.
model_rnn.add(Embedding(len(embedding_matrix),
                    embedding_dim, weights=[embedding_matrix],
                    input_length=max_len, trainable=True)
         )

# return_sequences=True keeps one output per timestep so the convolution
# below still receives a sequence.
model_rnn.add(LSTM(60, return_sequences=True, name='lstm_layer'))
model_rnn.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
model_rnn.add(MaxPooling1D(3))
model_rnn.add(GlobalMaxPooling1D())
model_rnn.add(BatchNormalization())
# Add fully connected layers
model_rnn.add(Dense(50, activation='relu'))
model_rnn.add(Dropout(0.3))
# Six independent sigmoid outputs, matching the six label columns in `y`.
model_rnn.add(Dense(6, activation='sigmoid'))

model_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so call it directly;
# wrapping it in print() appends a stray "None" to the output.
model_rnn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 70, 300)           54959400  
_________________________________________________________________
lstm_layer (LSTM)            (None, 70, 60)            86640     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 70, 128)           38528     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 23, 128)           0         
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 128)               0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 128)               512       
_________________________________________________________________
dense_5 (Dense)              (None, 50)                6450      
__________

In [41]:
# Train the LSTM + CNN model for a single epoch on the padded training sequences.
model_rnn.fit(x=padded_train, y=y_train, epochs=1)

Epoch 1/1


<keras.callbacks.History at 0x7f9d78047c50>

In [42]:
from sklearn.metrics import roc_auc_score
# Score the LSTM + CNN model on the held-out split and report ROC AUC.
y_pred = model_rnn.predict(padded_test)
rnn_auc = roc_auc_score(y_test, y_pred)
print(rnn_auc)

0.9828070415809732
