Import the necessary libraries

In [None]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GRU, Conv1D, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import Callback
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

Read in the training and testing data

In [None]:
# Load the train and test splits from the zipped competition CSVs.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip',
        '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip',
    )
)

In [None]:
# Preview the first rows of the raw training data.
train.head()

Check for null values in the training and test sets and fill them in

In [None]:
# Report the number of missing values per column for both splits.
print("Check for missing values in Train dataset")
null_check=train.isnull().sum()
print(null_check)
print("Check for missing values in Test dataset")
null_check=test.isnull().sum()
print(null_check)
print("filling NA with \"unknown\"")
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form on `df[col]` is chained assignment, which
# pandas warns about and which does not update the frame under Copy-on-Write.
train["comment_text"] = train["comment_text"].fillna("unknown")
test["comment_text"] = test["comment_text"].fillna("unknown")


Clean the data by removing HTML tags, punctuation, special characters, and IP addresses.

In [None]:
# Silence warnings unless the interpreter was started with explicit
# warning options.
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")
# `data` is another name for the `train` frame (no copy is made), so the
# cleaning steps applied to `data` below also modify `train`.
data = train
def cleanHtml(sentence):
    """Replace every ``<...>`` tag-like span in ``sentence`` with one space.

    The input is passed through ``str()`` first, so non-string values are
    accepted and cleaned as their string form.
    """
    return re.sub('<.*?>', ' ', str(sentence))
def cleanPunc(sentence):
    """Strip punctuation and special characters from ``sentence``.

    The characters ``? ! ' " # |`` are deleted outright, while
    ``. , ) ( \\ /`` are each replaced by a space. Leading/trailing whitespace
    is then trimmed and any remaining newlines are turned into spaces.

    Args:
        sentence: text to clean (must be a ``str``).

    Returns:
        The cleaned string.
    """
    # Inside a character class `|` is a literal, not alternation, so the
    # members are listed directly. `|` stays in this class because the
    # previous pattern deleted it as a side effect.
    cleaned = re.sub(r'[?!\'"#|]', r'', sentence)
    # `\\` matches a literal backslash; the previous `\|` only escaped the
    # pipe, so backslashes were never replaced.
    cleaned = re.sub(r'[.,)(\\/]', r' ', cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n", " ")
    return cleaned
def keepAlpha(sentence):
    """Replace each run of characters other than ASCII letters with a space.

    The sentence is split on whitespace, every word is filtered on its own,
    and the filtered words are joined back with single spaces before the
    result is stripped. Consecutive spaces can remain inside the result where
    characters were replaced.
    """
    filtered_words = (
        re.sub('[^a-z A-Z]+', ' ', token) for token in sentence.split()
    )
    return " ".join(filtered_words).strip()
# Run the training comments through the cleaning pipeline:
# lowercase -> strip HTML tags -> strip punctuation -> keep letters only.
data['comment_text'] = (
    data['comment_text']
    .str.lower()
    .apply(cleanHtml)
    .apply(cleanPunc)
    .apply(keepAlpha)
)

Remove stopwords from the data

In [None]:
from nltk.corpus import stopwords
# English stop words from NLTK plus a few extra number/filler words.
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
# Match a whole stop word followed by a non-word character OR the end of the
# string. The earlier pattern required a trailing non-word character, so a stop
# word that ended the comment was never removed. Words are escaped so they are
# always treated literally, and ordered longest-first (then alphabetically) so
# the compiled pattern is the same on every run regardless of set ordering.
re_stop_words = re.compile(
    r"\b("
    + "|".join(re.escape(word) for word in sorted(stop_words, key=lambda w: (-len(w), w)))
    + r")(?:\W|$)",
    re.I,
)
def removeStopWords(sentence):
    """Return ``sentence`` with every match of ``re_stop_words`` replaced by a space."""
    # Reading a module-level name needs no `global` declaration.
    return re.sub(re_stop_words, " ", sentence)
# Strip stop words from the cleaned training comments (`data` refers to `train`).
data['comment_text'] = data['comment_text'].apply(removeStopWords)

In [None]:
# `data` was bound to the training frame before cleaning, so `train` keeps
# pointing at the cleaned frame; print its (rows, columns) shape.
train = data
print(train.shape)


In [None]:
# Preview the training data after cleaning and stop-word removal.
train.head()

In [None]:
# Preview the test data, which has not been cleaned yet at this point.
test.head()

Apply the same preprocessing functions to test data

In [None]:
# `data` now refers to the test frame (no copy), so the column update below
# modifies `test` as well.
data = test

# Same pipeline as the training set: lowercase -> strip HTML tags ->
# strip punctuation -> keep letters only -> drop stop words.
data['comment_text'] = (
    data['comment_text']
    .str.lower()
    .apply(cleanHtml)
    .apply(cleanPunc)
    .apply(keepAlpha)
    .apply(removeStopWords)
)

test = data
print(test.shape)

In [None]:
# Preview the test data after the cleaning pipeline has been applied.
test.head()

Load in the training labels

In [None]:
# The six label columns the model predicts.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Label matrix with one column per class, in `list_classes` order.
y = train[list_classes].to_numpy()
# Cleaned comment text for each split.
list_sentences_train, list_sentences_test = train["comment_text"], test["comment_text"]

Tokenize the training and test data

In [None]:
# Vocabulary size limit passed to the tokenizer as `num_words`.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
# The word index is learned from the training comments only; the same
# mapping then encodes both splits.
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (list_sentences_train, list_sentences_test)
)

Pad or truncate each input sequence to a length of 200 tokens

In [None]:
# Fixed sequence length fed to the network.
maxlen = 200
X_train, X_test = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (list_tokenized_train, list_tokenized_test)
)
# One shape per line: training inputs, training labels, test inputs.
print(X_train.shape, y.shape, X_test.shape, sep="\n")

Split into training and validation sets

In [None]:
# Hold out 20% of the training rows for validation (fixed seed, so the split is repeatable).
# NOTE: this rebinds `X_train` to the 80% subset, so re-running this cell
# without re-running the padding cell would pair the already-reduced array
# with the full-length `y`.
X_train, X_val, y_train, y_val = train_test_split(X_train, y, train_size=0.8, random_state=233)

Code for implementing AUC-ROC evaluation

In [None]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair that is scored with
            ``roc_auc_score``. Required in practice: the default empty tuple
            cannot be unpacked into the two attributes.
        interval: evaluate on every ``interval``-th epoch. Epoch numbers
            passed by Keras are 0-based, so epoch 0 is always evaluated.
    """

    def __init__(self, validation_data=(), interval=1):
        # `super(Callback, self)` starts the MRO lookup *after* Callback and
        # therefore skips Callback.__init__; the zero-argument form runs it.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print the validation ROC-AUC for ``epoch``."""
        # `logs=None` replaces the mutable `{}` default; `logs` is not used here.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

Define the model. We will be training our embeddings. It is followed by a bidirectional GRU layer. Then a Conv1D layer and then a Global avg pooling layer. A dense layer of 32 neurons is connected to it which has a relu activation and the final layer has 6 neurons and sigmoid activation.

In [None]:
def model():
    """Build the (uncompiled) toxic-comment classifier.

    Architecture: embedding -> bidirectional GRU (sequence output) -> Conv1D
    -> global average pooling -> Dense(32) + ReLU -> dropout -> Dense(6)
    + sigmoid. The input length and vocabulary size are read from the
    module-level ``maxlen`` and ``max_features``.

    Returns:
        A Keras ``Model`` mapping token-id sequences to six sigmoid outputs.
    """
    token_ids = Input(shape=(maxlen,), name="input")
    x = Embedding(max_features, 128, name="embedding")(token_ids)

    # Recurrent encoder that keeps the per-timestep outputs for the convolution.
    recurrent = GRU(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2, name="bi_gru_0")
    x = Bidirectional(recurrent)(x)

    x = Conv1D(64, kernel_size = 3, padding = "valid", activation='relu', name="conv1d_0")(x)
    x = GlobalAveragePooling1D(name="avg_pool_0")(x)

    # Fully connected head.
    x = Dense(32,name='FC1')(x)
    x = Activation('relu')(x)
    x = Dropout(0.5,name="fc1_dropout")(x)
    x = Dense(6,name='out_layer')(x)
    outputs = Activation('sigmoid')(x)

    return Model(inputs=token_ids,outputs=outputs)

In [None]:
# Build the network and print its layer summary.
# NOTE: this rebinds the name `model` from the builder function to the built
# network, so this cell fails if it is re-run without first re-running the
# cell that defines `model()`.
model = model()
model.summary()

In [None]:
# Multi-label setup: six sigmoid outputs trained with binary cross-entropy.
# The metric is spelled out as BinaryAccuracy (thresholded, per label) instead
# of the string 'accuracy', whose concrete meaning Keras infers from the
# tensor shapes and which may resolve to categorical (argmax) accuracy for a
# 6-wide output. It keeps the name 'accuracy' so that 'val_accuracy' is still
# the key that gets logged and can be monitored by callbacks.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')])

Checkpoint the model based on validation accuracy and create a callback for ROC-AUC evaluation

In [None]:
# Save the model whenever the monitored validation accuracy reaches a new maximum.
checkpoint_path = os.path.join("../input/output/", "lstm-custom-embeddings-v4.hdf5")
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Print the ROC-AUC on the validation split after every epoch.
ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

In [None]:
# Training hyper-parameters.
batch_size = 64
epochs = 3

# Train with per-epoch validation, checkpointing and ROC-AUC reporting.
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    shuffle=True,
    callbacks=[cp_callback, ra_val],
)

In [None]:
# Run the trained model on the padded test sequences; each row of `y_pred`
# holds the six sigmoid outputs for one comment.
y_pred = model.predict(X_test,batch_size=1024,verbose=1)

In [None]:
# Submission export, disabled by wrapping it in a string literal; remove the
# triple quotes to write `y_pred` into the sample-submission layout.
'''
submission = pd.read_csv(os.path.join("../input/jigsaw-toxic-comment-classification-challenge/","sample_submission.csv.zip"))
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv(os.path.join("../input/toxic-comment-challenge-submission/",'submission.csv'), index=False)
'''

Since I am posting this notebook after the competition is over, I had access to the test labels, and I evaluated my model on them.

For testing, Kaggle marked the labels of certain samples in the test labels as -1 because they were not used for scoring. It is therefore necessary to remove those rows from both the test set and the test labels. To do that, I join the test sentences and the test labels on the `id` column and then drop the rows whose labels are -1.

In [None]:
# Load the test-set labels from the competition data folder.
test_labels = pd.read_csv(os.path.join("../input/jigsaw-toxic-comment-classification-challenge","test_labels.csv.zip"))

In [None]:
# Attach the label columns to each cleaned test comment by matching on "id".
test_set = test.join(test_labels.set_index("id"),on="id")

In [None]:
# Preview the joined test comments and labels (rows labelled -1 are still present).
test_set.head()

In [None]:
# Drop the rows whose `obscene` label is -1.
# NOTE(review): only `obscene` is checked; this assumes a row marked -1 is
# marked -1 in every label column — confirm against the label file.
test_set = test_set[test_set.obscene!=-1]

In [None]:
# Preview the test rows that remain after removing the -1 labels.
test_set.head()

In [None]:
# Label columns, in the same order used for training.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Cleaned comment text of the labelled test rows.
list_sentences_test = test_set["comment_text"]
# Ground-truth label matrix for those rows.
y_test = test_set[list_classes].to_numpy()

In [None]:
max_features = 20000
# Encode the labelled test comments with the tokenizer that was fitted on the
# training comments. Fitting a fresh Tokenizer on the test text would assign
# different integer ids to the words, while the model's embedding layer was
# trained on the training-set ids.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

In [None]:
# Same fixed sequence length as used for training.
maxlen = 200
X_test = pad_sequences(list_tokenized_test, maxlen=maxlen)
# One shape per line: test labels, then test inputs.
print(y_test.shape, X_test.shape, sep="\n")

In [None]:
# Report the loss and the compiled accuracy metric on the labelled test rows.
model.evaluate(X_test,y_test)

As you can see, I got an accuracy of 99.76%.