In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training CSV straight from the zipped file; pandas infers the
# compression from the ".zip" suffix, so no manual extraction is needed.
train_data = pd.read_csv("/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip")
train_data.head()  # preview the first rows

In [None]:
train_data[train_data["toxic"]==1]

In [None]:
X_train = train_data["comment_text"]

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
X_train

In [None]:
# Use every column from position 2 onward as the targets.
# NOTE(review): presumably the first two columns are the id and comment_text and
# the remainder are the six label columns — confirm against train_data.columns.
y_train = train_data.iloc[:, 2:]
y_train

In [None]:
# Convert the label DataFrame to a plain NumPy array.
# np.asarray (instead of `.values`) keeps this cell idempotent: re-running it on
# an already-converted array is a no-op rather than an AttributeError.
y_train = np.asarray(y_train)
y_train

# The text contains some line breaks ("\n"), so we replace them with spaces using regular expressions

In [None]:
import re

In [None]:
def clean_text(text):
    """Return `text` with every newline character replaced by a single space."""
    newline_pattern = r"\n"
    return re.sub(newline_pattern, " ", text)

In [None]:
X_train = X_train.apply(clean_text)
X_train

# Now turn the English text into sequences of integer token ids so the model can work with them

In [None]:
# Word-level tokenizer: `num_words` caps how many of the most frequent words are
# kept when texts are converted to ids, and '<oov>' is the placeholder token used
# for words outside that vocabulary.
tokenizer = keras.preprocessing.text.Tokenizer(num_words = 100000, oov_token='<oov>')

In [None]:
tokenizer.fit_on_texts(X_train)

# Pad sequences with 0's so that all sequences are of the same length

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Pad to the longest comment measured in TOKENS. Taking len() of the raw strings
# counts characters, which is always >= the token count and makes every padded
# sequence far longer than any real comment needs.
maxlen = max(len(seq) for seq in tokenizer.texts_to_sequences(X_train))

def preprocess_to_sequences(dataset, fitted_tokenizer, maxlen):
    """Convert raw texts into fixed-length integer sequences.

    Args:
        dataset: Iterable of strings to encode.
        fitted_tokenizer: Tokenizer that has already been fitted; its
            `texts_to_sequences` method performs the encoding.
        maxlen: Length every returned sequence is padded or truncated to.

    Returns:
        The output of `pad_sequences`: shorter sequences are padded at the
        front and longer ones are truncated from the front.
    """
    # Use the tokenizer that was passed in; previously this argument was ignored
    # and the global `tokenizer` was used regardless of what the caller supplied.
    sequences = fitted_tokenizer.texts_to_sequences(dataset)
    return pad_sequences(sequences, padding="pre", truncating="pre", maxlen=maxlen)

In [None]:
maxlen

In [None]:
X_train_tokenized = preprocess_to_sequences(X_train, tokenizer, maxlen)

# Split 80% train 20% validation

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% for validation. The seed is fixed (same value as the
# tf.random.set_seed call used for the model) so the split is reproducible.
X_train_tokenized, X_val_tokenized, y_train, y_val = train_test_split(
    X_train_tokenized, y_train, test_size=0.2, random_state=17
)

# Download pretrained GloVe embeddings to use them as the first layer in our models
# Embeddings map each token id to a dense vector; similar words end up close together in this vector space. An Embedding layer can also mask the padded positions so that the 0's added by padding are ignored, but only when it is created with `mask_zero=True`

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [None]:
path_to_glove_file = "./glove.6B.100d.txt"

# Map each GloVe word to its float32 coefficient vector.
embeddings_index = {}
# Read explicitly as UTF-8 rather than relying on the platform default encoding,
# which can fail on non-ASCII tokens in the vocabulary.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <c1> <c2> ...": split the word off once and parse
        # the remainder as space-separated numbers.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

In [None]:
embedding_dim = 100
# One row per entry in the tokenizer vocabulary, plus one extra row for index 0.
num_tokens = len(tokenizer.word_index) + 1

# Start from all zeros: any row that is never filled in below (words missing
# from GloVe, as well as "padding" and "OOV") keeps an all-zero vector.
embedding_matrix = np.zeros((num_tokens, embedding_dim))

hits = misses = 0
for token, row in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pre-trained vector for this word; leave its row as zeros.
        misses += 1
        continue
    embedding_matrix[row] = vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
from tensorflow.keras.layers import Embedding
# Embedding layer initialised from the pre-computed embedding matrix.
# mask_zero is not set here, so it keeps the library default.
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,  # keep the pre-trained vectors fixed during training
)

In [None]:
# Bidirectional LSTM with Conv1D, pre-trained 100d GloVe Embedding
# Disabled experiment: the definition is wrapped in a string literal, so none of
# it is executed and no `model` is created by this cell.
'''tf.random.set_seed(17)
model = tf.keras.Sequential([embedding_layer,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences = True)),
    tf.keras.layers.Conv1D(filters=128, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(6, activation='sigmoid')])'''

In [None]:
# Bidirectional LSTM with Conv1D
# Disabled experiment (string literal, never executed). This variant uses its own
# Embedding(150000, 50) layer instead of the GloVe-initialised `embedding_layer`.
'''tf.random.set_seed(17)
model = tf.keras.Sequential([tf.keras.layers.Embedding(150000, 50),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences = True)),
    tf.keras.layers.Conv1D(filters=128, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(6, activation='sigmoid')])
'''

In [None]:
# 2 GRU Layers
# Disabled experiment (string literal, never executed): two stacked GRU layers on
# top of an Embedding(150000, 50) layer, ending in a 6-unit sigmoid output.
'''tf.random.set_seed(17)
model = keras.models.Sequential([
    tf.keras.layers.Embedding(150000, 50),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(6, activation="sigmoid")
])'''

In [None]:
# Reset Keras' global state before the model is built in the next cell.
keras.backend.clear_session()

In [None]:
# Bidirectional GRU followed by a second, unidirectional GRU
tf.random.set_seed(17)
model = keras.models.Sequential()
model.add(embedding_layer)
# return_sequences=True so the following recurrent layer receives the full sequence.
model.add(keras.layers.Bidirectional(keras.layers.GRU(64, return_sequences=True)))
model.add(keras.layers.GRU(128))
# Six sigmoid units: one output per label.
model.add(keras.layers.Dense(6, activation="sigmoid"))


In [None]:
model.summary()

In [None]:
# Binary cross-entropy over the six sigmoid outputs; AUC is tracked as the metric.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["AUC"])
# Score the held-out split after every epoch. Without validation_data the
# X_val_tokenized / y_val arrays are never used and only training AUC is shown.
model.fit(
    X_train_tokenized,
    y_train,
    epochs=2,
    batch_size=200,
    validation_data=(X_val_tokenized, y_val),
)

# Let's predict some text and see what the model thinks of Gordon Ramsay

In [None]:
sample_text = "You've got the nerve to tell me that some of them are fine. Wishy washy, not even seasoned, and you know what, more importantly, they're boiled. You donkey!"
# Quote from Gordon Ramsay, Hell's Kitchen

# function that prints the probabilities of 6 labels of one sample text
def predict_print_text(sample_text):
    """Print the model's predicted probability for each of the six labels.

    The text is encoded with the module-level `tokenizer` and `maxlen` via
    `preprocess_to_sequences`, scored with the module-level `model`, and each
    label is printed as "<label> = <probability>" with two decimals.
    Returns None.
    """
    label_names = ("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate")
    encoded = preprocess_to_sequences([sample_text], tokenizer, maxlen)
    # Only one text was passed in, so the first row holds its six scores.
    scores = model.predict(encoded)[0]
    for position, name in enumerate(label_names):
        print(f"{name} = {scores[position]:.2f}")

predict_print_text(sample_text)

# Results

Model|Embedding|AUC|Total parameters|Time per epoch (seconds)|
-----|-----|-----|-----|-----|
Bidirectional LSTM with Conv1D|300d|0.9848|45,542,054|889
Bidirectional LSTM with Conv1D|100d|0.9847|15,337,254|808
Bidirectional LSTM with Conv1D|50d|0.9842|7,786,054|659
Bidirectional LSTM with Conv1D|GloVe 100d|0.9718|21,371,154|625
Bidirectional LSTM with Conv1D|GloVe 50d|0.9720|10,803,004|584
2 GRU Layers|300d|0.9877|45,264,966|1178
2 GRU Layers|100d|0.9873|15,188,166|660
2 GRU Layers|50d|0.9871|7,668,966|527
2 GRU Layers|GloVe 100d|0.9780|21,222,066|471
2 GRU Layers|GloVe 50d|0.9749|10,685,916|455
Bidirectional GRU|300d|0.9879|45,240,390|1303
Bidirectional GRU|100d|0.9877|15,163,590|755
Bidirectional GRU|50d|0.9871|7,644,390|599
Bidirectional GRU|GloVe 100d|0.9797|21,197,490|565
Bidirectional GRU|GloVe 50d|0.9761|10,661,340|527
