# Toxic Comments classification Using Deep Learning.
##### Even if most of it is just preprocessing of data

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Location of the preprocessed training CSV that the next cell loads with pandas.
# NOTE(review): this name shadows Python's built-in dir(); it is kept because
# the loading cell reads it by this name.
dir = "../input/cleaned-toxic-comments/train_preprocessed.csv"


## Get the Corpus of all the comments and related Toxicity fields

In [None]:
# Load the preprocessed comments into a DataFrame from the CSV path held in `dir`.
data = pd.read_csv(dir)
# Preview the first rows (shown as the cell output when run in a notebook).
data.head()

### We can see that the dataset contains the following fields,
1. comment_text: The comments in english
2. id: The comment Id (not useful)
3. identity_hate, insult, obscene, severe_toxic, threat, toxic: the types of toxicity in the comment
4. set: whether the comment is in the train set or the test set (since we only train here, this column isn't required)
5. toxicity: the measure of toxicity of the comment (0 for non-toxic, 1 and above for toxic, with increasing severity)

## Since we are making a simple binary classifier which separates toxic from non-toxic comments, we can drop the types of toxicity and only keep the "toxicity" feature.

## Divide the data set into features and labels:
> Features = "comment_text"            
> Labels = "toxicity"

In [None]:
# Model input: the raw comment strings.
Features = data['comment_text']
# Binary target: 0 where toxicity is exactly 0, 1 for every other value.
# Computed as one vectorised NumPy expression instead of a Python-level loop
# over the column; the resulting integer array holds the same values.
Labels = np.where(data['toxicity'].to_numpy() == 0, 0, 1)

# Tokenizing and preprocessing the data

### As we can see, the comments are in string format. Since our models require numerical data to work with, we need to convert these words into some numbers.

#### We can use the `Tokenizer` class provided by the Keras API to convert the words in our comment_text to a unique number for each word

In [None]:
NUM_WORDS = 40000 # Vocabulary cap passed to the Tokenizer as num_words; also bounds the rows of the embedding matrix
MAXLEN = 50 # Length (in tokens) that every comment is padded or truncated to by pad_sequences
PADDING = 'post' # Side on which shorter comments are padded ('post' = at the end of the sequence)

In [None]:
# Tokenizer limited to NUM_WORDS; the cap is applied when texts are converted
# to sequences, not when the vocabulary is built.
tokenizer = Tokenizer(num_words=NUM_WORDS)

# Build the vocabulary (word -> integer id) from all comments
tokenizer.fit_on_texts(Features)

# Full word -> id mapping learned from the comments.
# NOTE: per the Keras docs, word_index holds every word seen during fitting and
# is not cut down to NUM_WORDS.
word_idx = tokenizer.word_index

# Replace each comment with the list of ids of its words
Feature_sequences = tokenizer.texts_to_sequences(Features)

# Bring every sequence to exactly MAXLEN ids, padding on the PADDING side.
# NOTE(review): `truncating` is left at its default, which per the Keras docs is
# 'pre' (over-long comments lose their beginning) - confirm this is intended.
padded_sequences = pad_sequences(Feature_sequences, maxlen = MAXLEN, padding = PADDING)

In [None]:
# Show one comment (row 2) at each stage of the pipeline:
# raw text, token ids, then padded token ids.
print("The Transformation of sentence::")
# Label fixed: the original printed "Sentencen" (a stray 'n' before the newline).
print("\n\nThe normal Sentence:\n")
print(Features[2])
print("\n\nThe tokenized sequence:\n")
print(Feature_sequences[2])
print("\n\nThe padded sequence:\n")
print(padded_sequences[2])

# Model input matrix: one row of token ids per comment.
X = np.array(padded_sequences)

# Loading the word Embeddings
### When training a deep learning model on text, an Embedding layer assigns a vector to each word so that words with similar meanings end up close together. Since learning these vectors from scratch is a long and computation-heavy task, we generally import pre-trained vectors and use them.

### These vectors are usually high-dimensional, for example 50 or 100 dimensions, so that the meaning of the words can be captured properly.


<img src="https://nlp.stanford.edu/projects/glove/images/man_woman.jpg" style="width:800px;height:500px;">

#### GloVe Visualization provided by stanford

### There are multiple sources from where a person can get these word embeddings. We will use the one named "Global Vectors for Word Representation", or GloVe, provided [here](https://www.kaggle.com/watts2/glove6b50dtxt). You can read more about it on [this page](https://nlp.stanford.edu/projects/glove/).
### In this notebook we will be using 50 dimensional vector version for the word embeddings.    
#### The embeddings are stored as : "word",    vector[0],     vector[1],      vector[2], ..........,     vector[49]

In [None]:
# Peek at the first record of the GloVe file to see how a line is laid out.
with open("../input/glove6b50dtxt/glove.6B.50d.txt", encoding='utf-8') as f:
    first_line = f.readline()
    # An empty file yields '' here; print nothing in that case.
    if first_line:
        print(first_line)

As we can see, the word 'the' is followed by a long list of numbers: the vector corresponding to that word. We need to split each line into the word and its vector and store them in a dictionary for easy use.

In [None]:
EMBEDDING_DIM = 50 # Width of each word vector; must match the 50-dimensional GloVe file (glove.6B.50d.txt) used in this notebook

In [None]:
# Map each GloVe token to its embedding vector (word -> float32 NumPy array).
word_2_vec = {}
with open("../input/glove6b50dtxt/glove.6B.50d.txt", encoding='utf-8') as f:
    for line in f:
        # Each record is "<word> <v0> <v1> ...", separated by whitespace.
        elements = line.split()
        if not elements:
            # Tolerate blank lines (e.g. a stray newline at the end of the
            # file) instead of failing with IndexError on elements[0].
            continue

        word = elements[0]
        # Everything after the word is the vector; parse it as float32.
        vecs = np.asarray(elements[1:], dtype='float32')
        word_2_vec[word] = vecs

print("Done....\n")

In [None]:
# Report the sizes involved before building the embedding matrix.
print(f"Number of words {len(word_2_vec)}")
# len(word_idx) is the size of the tokenizer's vocabulary, not a vector shape,
# so the label now says what is actually printed.
print(f"Number of words in the tokenizer index {len(word_idx)}")
print(f"Number of max words to be saved {NUM_WORDS}")

### Cool. So we have 400000 unique words with their vectors. Time to convert each word in the comment_text to a sequence of these vectors

# Combine the Word Index and the Vectors
### First we find the minimum from the number of our vocabulary words and number of words that are indexed during the tokenizer fit since we want the words in the embedding matrix to be in our vocabulary AND have a token. It should also have a vector associated with it.

In [None]:
# Row count of the matrix: the vocabulary cap, or the tokenizer vocabulary
# size plus one when the corpus has fewer words than the cap.
num = min(NUM_WORDS, len(word_idx) + 1)

# Row i holds the GloVe vector of the word with token id i. Rows stay all-zero
# for ids that have no GloVe entry.
embedding_matrix = np.zeros((num, EMBEDDING_DIM))

for word, idx in word_idx.items():
    # Ids at or beyond the cap have no row in the matrix.
    if idx >= NUM_WORDS:
        continue

    word_vec = word_2_vec.get(word)
    # Words missing from GloVe keep their zero row.
    if word_vec is None:
        continue

    embedding_matrix[idx] = word_vec

print(embedding_matrix.shape)

# Making and training on the model


## Coming to the best part of the project, Training the model



### Before we start on the model, let's first make a function which can help us train multiple models one after the other and compare them

In [None]:
def train(models, epochs, graph=True, verbose=2):
    """Compile and fit each model in turn, optionally plotting validation accuracy.

    Every model is compiled with binary cross-entropy loss, the Adam optimizer
    and the ``acc`` metric, then fitted on the module-level ``X`` / ``Labels``
    arrays with a batch size of 128.

    Args:
        models: Iterable of Keras models to train, in order.
        epochs: Number of epochs to fit each model for.
        graph: When true, draw one validation-accuracy curve per model on a
            single shared figure. When false, no figure is created.
        verbose: Verbosity level forwarded to ``model.fit``.

    Returns:
        A list containing the object returned by ``model.fit`` for each model,
        in the same order as ``models``.
    """
    # Only touch matplotlib when a plot was actually requested.
    if graph:
        plt.figure(figsize=(10, 7))

    histories = []
    for n, model in enumerate(models, start=1):
        print(f"model number : {n} is training")
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

        history = model.fit(
            X,
            Labels,
            batch_size=128,
            epochs=epochs,
            # 20 percent of the data is reserved for validation so that
            # overfitting / underfitting can be monitored.
            validation_split=0.2,
            verbose=verbose,
        )
        histories.append(history)

        if graph:
            plt.plot(history.history['val_acc'], label=f"Model {n}")

    if graph:
        plt.xlabel('Epochs')
        plt.ylabel('Validation Accuracy')
        plt.legend()

    # Hand the collected histories back so callers can inspect or re-plot them.
    return histories

### Let's start with a very simple model which starts with an embedding layer, goes through an LSTM and a GlobalAveragePooling1D layer, and then to a final output layer with a sigmoid activation.

#### For information about the LSTM networks or other RNN networks you can refer to [this video](https://www.youtube.com/watch?v=WCUNPb-5EYI) or try to get [this course](https://www.coursera.org/specializations/natural-language-processing) by Deeplearning.ai on Coursera(highly recommended)

In [None]:
# Baseline: frozen pre-trained embeddings -> small LSTM -> average over the
# time steps -> single sigmoid unit.
model = tf.keras.models.Sequential()

# The embedding layer is initialised from the embedding matrix. Keep
# trainable=False, otherwise fitting would overwrite the pre-trained vectors.
model.add(
    tf.keras.layers.Embedding(
        num,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAXLEN,
        trainable=False,
    )
)
model.add(tf.keras.layers.LSTM(5, return_sequences=True))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

## Lets add some more LSTM units and more Dense Layers

In [None]:
# Wider variant: a 50-unit LSTM followed by two extra Dense layers before the
# sigmoid output.
model2 = tf.keras.models.Sequential()

# Frozen embedding layer initialised from the embedding matrix.
model2.add(
    tf.keras.layers.Embedding(
        num,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAXLEN,
        trainable=False,
    )
)
model2.add(tf.keras.layers.LSTM(50, return_sequences=True))
model2.add(tf.keras.layers.GlobalAveragePooling1D())
model2.add(tf.keras.layers.Dense(128, activation='relu'))
model2.add(tf.keras.layers.Dense(5, activation='relu'))
model2.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model2.summary()

## Lets add some Convolution layers to see how it does

In [None]:
# Same stack as the 50-unit LSTM model, with a 1-D convolution (10 filters,
# kernel size 15) applied to the LSTM output sequence before pooling.
model3 = tf.keras.models.Sequential()

# Frozen embedding layer initialised from the embedding matrix.
model3.add(
    tf.keras.layers.Embedding(
        num,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAXLEN,
        trainable=False,
    )
)

model3.add(tf.keras.layers.LSTM(50, return_sequences=True))
model3.add(tf.keras.layers.Conv1D(10, 15, activation='relu'))

model3.add(tf.keras.layers.GlobalAveragePooling1D())
model3.add(tf.keras.layers.Dense(128, activation='relu'))
model3.add(tf.keras.layers.Dense(5, activation='relu'))
model3.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model3.summary()

## Finally lets add some Bidirectional LSTMs to the first model to see how it compares 

In [None]:
# The baseline architecture with its 5-unit LSTM wrapped in Bidirectional.
model4 = tf.keras.models.Sequential()

# The embedding layer is initialised from the embedding matrix. Keep
# trainable=False, otherwise fitting would overwrite the pre-trained vectors.
model4.add(
    tf.keras.layers.Embedding(
        num,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAXLEN,
        trainable=False,
    )
)
model4.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(5, return_sequences=True)
    )
)
model4.add(tf.keras.layers.GlobalAveragePooling1D())
model4.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model4.summary()

## Time to finally use our train function

In [None]:
# Train the four architectures back to back for 10 epochs each. With the
# default graph=True, train() plots their validation-accuracy curves together.
models = [model, model2, model3, model4]
train(models, epochs=10)

### You can test out your own combinations for the models. I hope you learnt something new and useful today. If you have any questions, leave them in the comments and I will try my best to help you out.