# Toxic Comment Classification with ConvNets
Author: Amit R. Baroi.

This notebook, which was [implemented on Kaggle](https://www.kaggle.com/code/amitrobertbaroi/project-toxic-comment-classification), is based on [previous work by C. MATTHEWS](https://www.kaggle.com/code/charlesmatthews/toxic-twitter-with-keras-gru-1d-conv).

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy

from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.layers import Dense, Input, Bidirectional, Conv1D, GRU
from tensorflow.keras.layers import Embedding, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from tqdm.auto import tqdm

## 1. Load data and GloVe embedding file
We will use the pre-trained GloVe embedding, so we load it along with the training and test data.

In [None]:
# Pre-trained GloVe vectors (the file is read later, when the embedding index is built)
glove_file = '../input/glove840b300dtxt/glove.840B.300d.txt'

# Training split; every column after the first two is treated as a label
train = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
labels = train.columns[2:].tolist()

# Test split: comments and their labels ship as separate files, joined on "id"
test_labels = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip")
test = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip').merge(
    test_labels, on="id"
)

# Directory that receives the figures and metrics written by later cells
out_dir = "outputs"
os.makedirs(out_dir, exist_ok=True)

A label value of -1 means that the row was not used for scoring in the competition, so we remove those rows from the test set.

In [None]:
# Remove every test row in which at least one of the six labels equals -1
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
has_unscored_label = test[label_cols].eq(-1).any(axis="columns")
drop_idxs = test.index[has_unscored_label]
test = test.drop(index=drop_idxs)

Separate the features (text data) from the labels (six types of toxicity).

In [None]:
# Training split: lower-cased comment text and the matching label matrix
X_train = train["comment_text"].str.lower()
y_train = train[labels].to_numpy()
print(f"X_train.shape = {X_train.shape}")
print(f"y_train.shape = {y_train.shape}")

# Test split, prepared the same way
X_test = test["comment_text"].str.lower()
y_test = test[labels].to_numpy()
print(f"X_test.shape = {X_test.shape}")
print(f"y_test.shape = {y_test.shape}")

## Class label distribution in train and test data

In [None]:
# One panel per label on a 2x3 grid; train and test histograms share each panel
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(16, 6))
for ax, col in zip(axes.ravel(), labels):
    for split_name, frame in (("train", train), ("test", test)):
        ax.hist(frame[col], label=split_name)
    # Log-scaled counts so both bars of each label stay readable
    ax.set_yscale("log")
    ax.set_title(f"{col}")
    ax.legend()
plt.tight_layout()
plt.savefig(f"{out_dir}/class-label-distribution.jpg", dpi=300)
plt.show()

## 2. Tokenize
We then tokenize each of the test and training entries.

The embedding is 300 dimensional, but we have options for the maximum length of comments, and the max number of features we shall include. Here we use 150k features and cap the length to 200 words.

In [None]:
max_features = 150000  # vocabulary cap passed to the tokenizer
maxlen = 200           # every comment is padded / truncated to this many tokens
embed_size = 300       # dimensionality of the GloVe vectors

tok = text.Tokenizer(num_words=max_features, lower=True)
# NOTE(review): the vocabulary is built from train AND test text; confirm this is intended.
tok.fit_on_texts(list(X_train) + list(X_test))

# Feed the token-id sequences straight into padding instead of overwriting
# X_train / X_test: the raw text stays available and the cell can be re-run
# without tokenizing already-tokenized data.
x_train = sequence.pad_sequences(tok.texts_to_sequences(X_train), maxlen=maxlen)
x_test = sequence.pad_sequences(tok.texts_to_sequences(X_test), maxlen=maxlen)

## 3. Embedding
### 3.1 Prepare embedding matrix
We build the embedding index by looping through the GloVe file and adding each word's vector to a dictionary (2.2M words).

In [None]:
# Map each GloVe token to its vector (one token per line of the file).
embeddings_index = {}
with open(glove_file, encoding='utf8') as f:
    for line in tqdm(f):
        # Split from the right at most embed_size times: the last embed_size
        # fields are the coefficients and everything before them is the token,
        # so a token that itself contains a space is kept in one piece.
        values = line.rstrip().rsplit(' ', embed_size)
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

### 3.2 Map word tokens to embedding vector
We next map each of the tokens to its embedding vector. If a word is not found, we use the spaCy library to lemmatize it and look up the lemma instead. If we still can't find it, the word keeps a random vector.

In [None]:
! python -m spacy.en.download all

In [None]:
# spacy.load('en') is deprecated; load the small English model package directly.
import en_core_web_sm
nlp = en_core_web_sm.load()

word_index = tok.word_index

# Rows start as small random vectors, so any word that is resolved neither
# directly nor through its lemma keeps a random embedding.
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.random.randn(num_words, embed_size)/4
kk = 0  # number of vocabulary words that received a GloVe vector
for word, i in tqdm(word_index.items()):
    # Indices at or beyond the matrix size have no row to fill.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        kk += 1
    else:
        # Fall back to the first lemma produced by spaCy that GloVe knows.
        for x in nlp(word, disable=['parser', 'ner']):
            embedding_vector = embeddings_index.get(x.lemma_)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                kk += 1
                break

## 4. Implement Models
### 4.1 Paper Model
We first implement the model from the Georgakopoulos et al., (2018) research paper.

In [None]:
# Frozen GloVe embedding -> three stacked 1D convolutions -> average and max
# pooling over time -> six sigmoid outputs (one per label).
model_input = Input(shape=(maxlen, ))
# Size the embedding from the matrix itself: it has min(max_features, vocab + 1)
# rows, so hard-coding max_features breaks when the vocabulary is smaller.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix], trainable=False)(model_input)
x = Conv1D(128, kernel_size=3)(x)
x = Conv1D(128, kernel_size=4)(x)
x = Conv1D(128, kernel_size=5)(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
paper_model_preds = Dense(6, activation="sigmoid")(x)
paper_model = Model(model_input, paper_model_preds, name="paper_model")
paper_model.summary()

## Paper model training
The research paper model is less complex, so it trains faster. Each iteration takes around 25 seconds so 10 iterations would take a bit more than 4 minutes.

In [None]:
batch_size = 128
epochs = 10

# `learning_rate` replaces the deprecated `lr` keyword of the Keras optimizers.
paper_model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=2e-4), metrics=['accuracy'])
history_paper_model = paper_model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)

In [None]:
# Training curves of the paper model: accuracy on the left, loss on the right
fig, (ax_acc, ax_loss) = plt.subplots(ncols=2, figsize=(12, 6))
paper_history = history_paper_model.history
epoch_ticks = np.arange(0, epochs)

# Left panel: accuracy per epoch
ax_acc.plot(paper_history['accuracy'])
ax_acc.set_title('paper model accuracy')
ax_acc.set_xlabel('epoch')
ax_acc.set_ylabel('accuracy')
ax_acc.set_xticks(epoch_ticks)

# Right panel: loss per epoch
ax_loss.plot(paper_history['loss'], c="orange")
ax_loss.set_title('paper model loss')
ax_loss.set_xlabel('epoch')
ax_loss.set_ylabel('loss')
ax_loss.set_xticks(epoch_ticks)

plt.savefig(f"{out_dir}/paper-model-training.jpg", dpi=300)
plt.show()

## Prediction

In [None]:
# Score the padded test sequences with the paper model; its output layer has
# six sigmoid units, one per label.
y_pred_paper = paper_model.predict(x_test, batch_size=1024, verbose=1)

## Evaluation

In [None]:
# Per-label accuracy, F1 score and confusion matrix for the paper model.
y_true_df = pd.DataFrame(y_test, columns=labels)
# Round the sigmoid outputs to the nearest integer and store them as ints so
# predictions and ground truth share one dtype.
y_pred_df = pd.DataFrame(y_pred_paper, columns=labels).round().astype(int)

acc_dict_paper = {}
f1s_dict_paper = {}
fig, axes = plt.subplots(ncols=3, nrows=2, figsize=(12, 6))
for col, ax in zip(labels, axes.flatten()):
    true_label = y_true_df[col]
    prediction = y_pred_df[col]
    # Record accuracy and F1 score
    acc = acc_dict_paper[col] = accuracy_score(true_label, prediction)
    f1 = f1s_dict_paper[col] = f1_score(true_label, prediction)
    # labels=[0, 1] pins the matrix to 2x2 even if one class is never predicted
    conf_matt = confusion_matrix(true_label, prediction, labels=[0, 1])
    # The cells are integer counts, so annotate them as integers
    sns.heatmap(conf_matt, annot=True, cbar=False, fmt="d", ax=ax)
    ax.set_title(f"{col}\naccuracy: {acc:.3f}\nF1-score: {f1:.3f}")
    ax.set_xlabel("Prediction")
    ax.set_ylabel("True")
plt.tight_layout()
plt.savefig(f"{out_dir}/paper-model-confusion-matrix.jpg", dpi=300)
plt.show()

In [None]:
# Empty comparison table: one row per model, one column per summary metric
model_names = ["paper_model", "project_model"]
metric_columns = ["acc_mean", "acc_sd", "f1_mean", "f1_sd"]
eval_df = pd.DataFrame(index=model_names, columns=metric_columns)

In [None]:
# Mean and standard deviation of the per-label scores for the paper model.
# Values are written with a single .loc call: chained indexing
# (eval_df[col][row] = ...) may assign into a temporary copy instead of eval_df.

# accuracy
accuracies_paper = list(acc_dict_paper.values())
mu = np.mean(accuracies_paper)
sd = np.std(accuracies_paper)
eval_df.loc["paper_model", "acc_mean"] = mu
eval_df.loc["paper_model", "acc_sd"] = sd
print(f"Avg accuracy: {mu:.3f} ± {sd:.3f}")

# f1-score
f1_scores_paper = list(f1s_dict_paper.values())
mu = np.mean(f1_scores_paper)
sd = np.std(f1_scores_paper)
eval_df.loc["paper_model", "f1_mean"] = mu
eval_df.loc["paper_model", "f1_sd"] = sd
print(f"Avg F1-score: {mu:.3f} ± {sd:.3f}")

### 4.2 Project Model Definition
Now we define the model. We use the following:

* Glove embedding layer
* Dropout layer
* Bidirectional GRU layer
* 1D Convolution layer 
* Average & Max pooling layer
* Dense layer 
* Six category output, using binary cross entropy

Some experimentation has gone into parameterizing the model, but due to a lack of resources it can likely be tuned further.

In [None]:
# Frozen GloVe embedding -> spatial dropout -> bidirectional GRU -> 1D
# convolution -> average and max pooling over time -> six sigmoid outputs.
model_input = Input(shape=(maxlen, ))
# Size the embedding from the matrix itself: it has min(max_features, vocab + 1)
# rows, so hard-coding max_features breaks when the vocabulary is smaller.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix], trainable=False)(model_input)
x = SpatialDropout1D(0.1)(x)
x = Bidirectional(GRU(200, return_sequences=True, dropout=0.25, recurrent_dropout=0.25, implementation=1))(x)
x = Conv1D(128, kernel_size=3)(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
# Named like paper_model_preds so the output tensor does not share the name
# `preds` with the prediction DataFrames built in the evaluation cells.
project_model_preds = Dense(6, activation="sigmoid")(x)
model = Model(model_input, project_model_preds, name="project_model")
model.summary()

## Project Model Training
Finally, we fit the model to the entire training set and run for 10 epochs. Each epoch takes around half an hour on a Kaggle CPU, so 10 epochs take around 5 hours.

In [None]:
# `learning_rate` replaces the deprecated `lr` keyword of the Keras optimizers.
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=2e-4), metrics=['accuracy'])
history = model.fit(x_train, y_train, batch_size=128, epochs=10, verbose=1)

In [None]:
# Training curves of the project model: accuracy on the left, loss on the right.
fig, ax = plt.subplots(ncols=2, figsize=(12, 6))

# Take the tick positions from the recorded history rather than the `epochs`
# variable of the paper-model cell, which this model's fit call does not use.
epoch_ticks = np.arange(0, len(history.history['loss']))

# summarize history for accuracy
ax[0].plot(history.history['accuracy'])
ax[0].set_title('project model accuracy')
ax[0].set_ylabel('accuracy')
ax[0].set_xlabel('epoch')
ax[0].set_xticks(epoch_ticks)
# summarize history for loss
ax[1].plot(history.history['loss'], c="orange")
ax[1].set_title('project model loss')
ax[1].set_ylabel('loss')
ax[1].set_xlabel('epoch')
ax[1].set_xticks(epoch_ticks)
plt.savefig(f"{out_dir}/project-model-training.jpg", dpi=300)
plt.show()

##  Prediction
We next make predictions on the test set.

In [None]:
# Score the padded test sequences with the project model; its output layer has
# six sigmoid units, one per label.
y_pred = model.predict(x_test, batch_size=1024, verbose=1)

## Evaluation

In [None]:
# Per-label accuracy, F1 score and confusion matrix for the project model.
y_true_df = pd.DataFrame(y_test, columns=labels)
# Round the sigmoid outputs to the nearest integer and store them as ints so
# predictions and ground truth share one dtype.
y_pred_df = pd.DataFrame(y_pred, columns=labels).round().astype(int)

acc_dict_proj = {}
f1s_dict_proj = {}
fig, axes = plt.subplots(ncols=3, nrows=2, figsize=(12, 6))
for col, ax in zip(labels, axes.flatten()):
    true_label = y_true_df[col]
    prediction = y_pred_df[col]
    # Record accuracy and F1 score
    acc = acc_dict_proj[col] = accuracy_score(true_label, prediction)
    f1 = f1s_dict_proj[col] = f1_score(true_label, prediction)
    # labels=[0, 1] pins the matrix to 2x2 even if one class is never predicted
    conf_matt = confusion_matrix(true_label, prediction, labels=[0, 1])
    # The cells are integer counts, so annotate them as integers
    sns.heatmap(conf_matt, annot=True, cbar=False, fmt="d", ax=ax)
    ax.set_title(f"{col}\naccuracy: {acc:.3f}\nF1-score: {f1:.3f}")
    ax.set_xlabel("Prediction")
    ax.set_ylabel("True")
plt.tight_layout()
plt.savefig(f"{out_dir}/project-model-confusion-matrix.jpg", dpi=300)
plt.show()

In [None]:
# Mean and standard deviation of the per-label scores for the project model.
# Values are written with a single .loc call: chained indexing
# (eval_df[col][row] = ...) may assign into a temporary copy instead of eval_df.

# accuracy
accuracies_proj = list(acc_dict_proj.values())
mu = np.mean(accuracies_proj)
sd = np.std(accuracies_proj)
eval_df.loc["project_model", "acc_mean"] = mu
eval_df.loc["project_model", "acc_sd"] = sd
print(f"Avg accuracy: {mu:.3f} ± {sd:.3f}")

# f1-score
f1_scores_proj = list(f1s_dict_proj.values())
mu = np.mean(f1_scores_proj)
sd = np.std(f1_scores_proj)
eval_df.loc["project_model", "f1_mean"] = mu
eval_df.loc["project_model", "f1_sd"] = sd
print(f"Avg F1-score: {mu:.3f} ± {sd:.3f}")

## Save final evaluation metrics
Save evaluation metric comparison dataframe to file

In [None]:
# Write the model comparison table to a CSV file inside the output directory
eval_file = f"{out_dir}/model-performance-comparison.csv"
eval_df.to_csv(eval_file)
print("Evaluation metric comparison saved at:", eval_file)
# Bare expression as the last statement so the notebook renders the table
eval_df

In [None]:
# eval_df["accuracy_mean"]["project_model"] = 
# eval_df["accuracy_sd"]["project_model"] = 

# eval_df["f1_mean"]["project_model"] = 
# eval_df["f1_sd"]["project_model"] = 