### Classifying Toxic Comments using GloVe and a Bidirectional GRU

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import os
import datetime

In [3]:
# Read the pre-cleaned toxic-comment data into a DataFrame and preview the first rows
CLEANED_DATA_CSV = './data/toxic_comments_cleaned.csv'
dataset = pd.read_csv(CLEANED_DATA_CSV)
dataset.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why edit make username Hardcore Me...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,D'aww match background colour m seemingly stuc...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man m really try edit war 's guy constantl...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,More ca n't make real suggestion improvement w...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero any chance remember page 's


In [45]:
VOCAB_SIZE = 10000 # max no. of words kept by the tokenizer (passed as num_words)
MAX_SEQUENCE_LENGTH = 200 # max length of each entry (sentence)
EMBEDDING_DIM = 300      # embedding dimensions for word vectors
GLOVE_DIR = f"./GloVe/glove.42B.{EMBEDDING_DIM}d.txt" # path of the GloVe vectors text file (a file, despite the name)
OOV_TOKEN = '<OOV>' # passed to the tokenizer as oov_token

In [12]:
# Target columns, one per toxicity type. They are named explicitly rather than
# sliced by position so that extra leading/trailing columns in the CSV (e.g. a
# saved index column or the raw comment text) can never end up in the label matrix.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [15]:
# Model input: the pre-cleaned comment text. A comment whose cleaned text is empty
# is read back from the CSV as NaN (a float), which tokenizer.fit_on_texts cannot
# process, so missing values become empty strings and every entry is coerced to str.
X = dataset['cleaned_text'].fillna('').astype(str)

In [76]:
# Label matrix: one column per entry in `labels`
y = dataset[labels].values

# Build the word -> integer-id vocabulary from the comment text
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token=OOV_TOKEN,
                                                   num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X)
word_index = tokenizer.word_index

# Preview the first 15 (word, id) pairs of the vocabulary
first_entries = list(word_index.items())[:15]
dict(first_entries)


In [21]:
# Number of distinct entries in the tokenizer's word index
vocabulary_count = len(word_index)
print(f'Vocabulary size:{vocabulary_count}')

Vocabulary size:31785


In [29]:
# Encode every comment as a list of integer word ids
X_sequences = tokenizer.texts_to_sequences(X)

# Inspect the encoded comment at position 10 before any padding is applied
sample_sequence = X_sequences[10]
print('Text Sequences for Sentence 10')
print(f'Length of Sequence :{len(sample_sequence)}')
print('----------------------------------------------')
print(sample_sequence)

Text Sequences for Sentence 10
Length of Sequence :286
----------------------------------------------
[212, 10, 846, 42, 1, 385, 104, 652, 42, 1, 385, 177, 42, 5, 2203, 42, 10, 212, 10, 506, 846, 10, 8, 3, 1764, 212, 10, 63, 462, 1, 212, 10, 255, 210, 28, 80, 42, 670, 5, 367, 506, 846, 10, 42, 3, 2204, 212, 10, 17, 16, 42, 670, 5, 15, 102, 212, 10, 846, 20, 699, 212, 10, 351, 124, 174, 1021, 212, 10, 846, 5, 52, 76, 42, 5, 15, 723, 2, 234, 2, 2, 47, 1275, 282, 8, 5, 612, 1190, 2, 42, 2, 2, 2704, 712, 112, 212, 10, 42, 699, 322, 27, 507, 546, 506, 58, 13, 412, 699, 406, 280, 240, 64, 20, 79, 17, 90, 351, 182, 79, 5, 44, 9, 339, 531, 339, 1, 25, 42, 1, 385, 104, 652, 42, 1, 385, 177, 303, 4, 670, 5, 437, 6, 1021, 109, 118, 182, 659, 1800, 20, 109, 303, 39, 1021, 1576, 182, 20, 1963, 219, 47, 219, 40, 724, 1, 219, 4, 204, 10, 118, 913, 1481, 57, 161, 182, 1684, 213, 219, 4, 1801, 182, 28, 1258, 128, 32, 36, 25, 17, 36, 642, 182, 2602, 134, 303, 6, 13, 214, 20, 109, 40, 330, 2968, 633, 134

In [32]:
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH ids: shorter ones are
# zero-padded at the end, longer ones are cut at the end
X_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(
    X_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post',
)

padded_sample = X_sequences_padded[10]
print(f'Length of Padded Sequence :{len(padded_sample)}')


Length of Padded Sequence :200


In [34]:
# Sanity check: the feature and label arrays should have the same number of rows
features_shape = X_sequences_padded.shape
labels_shape = y.shape
print(f'Shape of dataset Sequence :{features_shape}')
print(f'Shape of Label :{labels_shape}')


Shape of dataset Sequence :(10000, 200)
Shape of Label :(10000, 6)


In [None]:
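# # Split into Train and Test
# NOTE: leftover from another notebook and not used here -- `news_data` and its
# `category` column are not defined in the cells of this notebook; the hold-out
# set comes from `validation_split` in model.fit instead.
# X_train,X_valid,y_train,y_valid = train_test_split(news_data['cleaned_text'],news_data['category'],test_size = 0.3)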


### Load the GloVe Vectors into a Dictionary

In [48]:
def create_embeddings_index(glove_dir):
    """Load pre-trained GloVe word vectors from a text file into a dictionary.

    Each non-empty line is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        glove_dir: Path to the GloVe vectors text file (despite the name, it
            is opened as a file, not a directory).

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings_index = {}
    # The context manager closes the file even if parsing a line raises.
    with open(glove_dir, encoding="utf8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at EOF): nothing to index.
                continue
            word = values[0]
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')

    return embeddings_index

In [49]:
# Parse the GloVe file once; the result maps each token to its pre-trained vector
embeddings_index = create_embeddings_index(glove_dir=GLOVE_DIR)

In [51]:
# Build the lookup table for the Embedding layer: row k holds the GloVe vector of
# the word whose tokenizer id is k. Row 0 and the rows of words that GloVe does
# not contain stay all-zero.
vocab_rows = len(word_index) + 1
embeddings_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embeddings_matrix[row] = vector
        


In [89]:
# Load the TensorBoard notebook extension, which provides the %tensorboard magic
%load_ext tensorboard

In [90]:
# Root folder for TensorBoard event files; created up front (no error if it already exists)
logs_base_dir = "logs"
os.makedirs(logs_base_dir, exist_ok=True)
# Start the inline TensorBoard viewer pointed at that folder
%tensorboard --logdir {logs_base_dir}

Launching TensorBoard...

In [91]:
# # Build the Model

# model = tf.keras.Sequential([
#     # Embedding Layer 
#     tf.keras.layers.Embedding(input_dim=len(word_index)+1,
#                               output_dim=EMBEDDING_DIM,
#                               weights = [embeddings_matrix],
#                               input_length=MAX_SEQUENCE_LENGTH,
#                               trainable=False,
#                               name = 'embeddings'
#                               ),
#     # Bidirectional GRU layers for learning long-term dependencies
#     tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128,return_sequences=True)),
#     tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64,return_sequences=True)),
#     tf.keras.layers.Dropout(0.5),
#     # Dense Layer with RELU
#     tf.keras.layers.Dense(50,activation='relu'),
#     tf.keras.layers.Dropout(0.5),
#     # Output layer with 6 sigmoid units, one per toxicity label, so that each
#     # label gets its own independent probability (multi-label classification)
#     tf.keras.layers.Dense(6,activation='sigmoid')
# ])

# model.summary()

In [92]:
# Input: a batch of padded word-id sequences of fixed length
sequence_input = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')

# Frozen embedding lookup initialised from the pre-trained GloVe matrix
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embeddings_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
    name='embeddings',
)
embedded_sequences = embedding_layer(sequence_input)

# Hidden stack, applied in order: bidirectional GRU over the sequence, average
# over time steps, dropout, a small ReLU layer, and a lighter dropout
hidden_stack = [
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True)),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dropout(0.1),
]
x = embedded_sequences
for hidden_layer in hidden_stack:
    x = hidden_layer(x)

# Output: 6 independent sigmoid units
preds = tf.keras.layers.Dense(6, activation='sigmoid')(x)


In [93]:
# Wrap the input/output tensors in a trainable Model and configure training:
# binary cross-entropy on the sigmoid outputs, Adam optimizer, accuracy metric
model = tf.keras.models.Model(inputs=sequence_input, outputs=preds)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [94]:
# Write this run's TensorBoard events to its own timestamped sub-folder of the log
# root. os.path.join builds the path with the platform's separator; a hard-coded
# "logs\\" prefix is only a directory separator on Windows and elsewhere produces a
# folder literally named "logs\<timestamp>" outside the directory TensorBoard watches.
# (`datetime` and `os` are imported in the notebook's import cell.)
run_timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = os.path.join(logs_base_dir, run_timestamp)
# histogram_freq=1 records weight histograms every epoch
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir, histogram_freq=1)

In [95]:
# Early stopping watches the validation loss and rolls back to the best epoch.
# Patience has to be smaller than the number of epochs to have any effect, and the
# monitored quantity should come from the held-out split rather than training data.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=3,
                                                  restore_best_weights=True)

try:
    # validation_split holds out the last 33% of the rows for validation
    history = model.fit(X_sequences_padded, y,
                        epochs=10, batch_size=64,
                        validation_split=0.33,
                        callbacks=[early_stopping, tensorboard_callback],
                        verbose=1)

except KeyboardInterrupt:
    # Training was interrupted by hand: keep the weights learned so far.
    # Note that `history` is not assigned in this case.
    model.save('multilabel_classification_model.h5')
    print('Model Saved because of user input')

Train on 6699 samples, validate on 3301 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [97]:
# Persist the trained model (architecture + weights) as a single HDF5 file
model.save(filepath='./multi_label_toxic_comment_classifier.h5')