In [1]:
import time
start_time = time.time()
from sklearn.model_selection import train_test_split
import sys, os, re, csv, codecs, numpy as np, pandas as pd
np.random.seed(32)
os.environ["OMP_NUM_THREADS"] = "4"
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer

Using TensorFlow backend.


In [2]:
# Substitutions applied, in order, only when wiki markup cleaning is requested.
_WIKI_TOKEN_SUBS = (
    # embedded image references
    (r"image:[a-zA-Z0-9]*\.jpg", " "),
    (r"image:[a-zA-Z0-9]*\.png", " "),
    (r"image:[a-zA-Z0-9]*\.gif", " "),
    (r"image:[a-zA-Z0-9]*\.bmp", " "),
    # hex colour codes and {| ... |} table markup
    (r"#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})", " "),
    (r"\{\|[^\}]*\|\}", " "),
    # namespace links / templates
    (r"\[?\[user:.*\]", " "),
    (r"\[?\[wikipedia:.*\]", " "),
    (r"\[?\[special:.*\]", " "),
    (r"\[?\[category:.*\]", " "),
)

# Substitutions applied, in order, to every comment. Order matters: e.g.
# "can't" must be expanded before the generic "n't" rule, and the final
# whitespace collapse must come after every rule that inserts padding.
_TEXT_SUBS = (
    # contractions
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", " cannot "),
    (r"n't", " not "),
    (r"i'm", " i am "),
    (r"\'m", " i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # punctuation: either dropped or padded so it becomes its own token
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\?", " ? "),
    (r"\"", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "5k" -> "5000"; the \b keeps units such as "10kb" intact
    (r"(\d+)k\b", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # "1990s" -> "1990" (the previous pattern r"\0s" matched a NUL byte
    # followed by "s", so it never fired on real text)
    (r"0s\b", "0"),
    # keep the surrounding spaces so neighbouring words are not glued together
    (r" 9 11 ", " 911 "),
    # word boundaries stop these from firing inside longer words
    (r"\be - mail\b", "email"),
    (r"\bj k\b", "jk"),
    # whitespace normalisation
    (r"\s{2,}", " "),
    (r"\n", " "),
)


def clean_text(cmnt_text, clean_wiki_tokens = True):
    """Normalise a raw comment string before tokenisation.

    The text is lower-cased, URLs and IPv4-looking addresses are removed,
    wiki markup is optionally blanked out, common English contractions are
    expanded, punctuation is dropped or padded with spaces, and runs of
    whitespace are collapsed to a single space. The result is not stripped,
    so it may start or end with one space.

    Parameters
    ----------
    cmnt_text : str
        The raw comment.
    clean_wiki_tokens : bool, default True
        When true, also blank out image references, hex colour codes,
        ``{| ... |}`` table markup and ``[[user:``/``[[wikipedia:``/
        ``[[special:``/``[[category:`` links.

    Returns
    -------
    str
        The cleaned comment.
    """
    cmnt_text = cmnt_text.lower()

    # Links and IP addresses are removed outright (no replacement space).
    cmnt_text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", cmnt_text)
    cmnt_text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", cmnt_text)

    if clean_wiki_tokens:
        for pattern, replacement in _WIKI_TOKEN_SUBS:
            cmnt_text = re.sub(pattern, replacement, cmnt_text)

    for pattern, replacement in _TEXT_SUBS:
        cmnt_text = re.sub(pattern, replacement, cmnt_text)

    return cmnt_text

In [3]:
def load_data(path="train.csv"):
    """
    Loads the labelled comments and returns train, val, and test splits.

    Parameters
    ----------
    path : str, default "train.csv"
        CSV file containing a ``comment_text`` column and the six label
        columns listed in ``list_classes`` below.

    Returns
    -------
    X_train, X_val, X_test : pandas.DataFrame
        Row subsets of the loaded frame (70% / 15% / 15%), with
        ``comment_text`` already passed through ``clean_text``.
    y_train, y_val, y_test : numpy.ndarray
        The matching label rows, one column per entry of ``list_classes``.
    """
    # Load the train dataset
    df = pd.read_csv(path)

    # Fill missing comments *before* cleaning: clean_text calls .lower() on
    # every value and would fail on NaN. fillna returns a new Series, so the
    # result has to be assigned back for it to have any effect.
    df["comment_text"] = df["comment_text"].fillna("no comment").apply(clean_text)

    list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    y = df[list_classes].values

    # split for cross-validation (train-70%, validation 15% and test 15%):
    # first hold out 30%, then cut that hold-out in half.
    X_train, X_holdout, y_train, y_holdout = train_test_split(df, y, test_size=0.3, random_state=123)
    X_test, X_val, y_test, y_val = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=123)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [4]:
X_train, X_val, X_test, y_train, y_val, y_test = load_data()

In [5]:
# Path of the pre-trained word-vector text file read when building embedding_index.
embedding_path = "glove.840B.300d.txt"
# Number of columns in the embedding matrix (the vector file name advertises 300d).
embed_size = 300
# Vocabulary cap: passed to the Tokenizer as num_words and used as the Embedding input size.
max_features = 100000
# Every comment is padded/truncated to this many tokens by pad_sequences.
max_len = 150

In [6]:
X_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
69965,bb274cdd98b173e5,thanks anyway there no consensus for retitling...,0,0,0,0,0,0
95303,fecd68f4bb1a159a,sorry i misunderstood the point you were tryin...,0,0,0,0,0,0
41327,6e3710fd4050058b,this is great i will address these in the next...,0,0,0,0,0,0
95363,fef6d1e43fbdd330,flemish ? pardon my ignorance but i have no id...,0,0,0,0,0,0
142270,f8f74f60e453668e,sally nicholls regarding this edit—since when ...,0,0,0,0,0,0


In [7]:
# Pull out the cleaned comment column of each split.
raw_text_train = X_train["comment_text"]
raw_text_valid = X_val["comment_text"]
raw_text_test = X_test["comment_text"]

# The vocabulary is learned from the training split only, so validation/test
# words never influence which tokens are kept.
tk = Tokenizer(num_words = max_features, lower = True)
tk.fit_on_texts(raw_text_train)

# Convert straight to padded integer arrays. The earlier version first stored
# the sequences in a new "comment_seq" column on each split frame, which can
# trigger pandas' SettingWithCopyWarning (the frames come from
# train_test_split) and served no purpose because the frames are replaced by
# arrays immediately afterwards.
# NOTE: X_train / X_val / X_test change type here (DataFrame -> ndarray), so
# this cell cannot be re-run without re-running the data-loading cell first.
X_train = pad_sequences(tk.texts_to_sequences(raw_text_train), maxlen = max_len)
X_val = pad_sequences(tk.texts_to_sequences(raw_text_valid), maxlen = max_len)
X_test = pad_sequences(tk.texts_to_sequences(raw_text_test), maxlen = max_len)

In [8]:
X_train.shape

(111699, 150)

In [9]:
def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(token, vector)`` pair.

    ``word`` is returned unchanged; every remaining positional argument is
    treated as a coefficient and packed into a float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Map each token in the vector file to its float32 vector. The file is opened
# in a `with` block so the handle is closed once the dict is built, and with
# an explicit encoding so parsing does not depend on the platform default.
with open(embedding_path, encoding="utf-8") as embedding_file:
    embedding_index = dict(get_coefs(*line.strip().split(" ")) for line in embedding_file)

In [10]:
word_index = tk.word_index

# Keras' Tokenizer numbers words from 1 (row 0 is reserved for padding), so the
# rows that can actually be populated are 0 .. len(word_index), capped at
# max_features. The previous `min(max_features, len(word_index))` was one row
# short and raised IndexError whenever the vocabulary was smaller than
# max_features.
nb_words = min(max_features, len(word_index) + 1)

# Allocate max_features rows so the matrix always matches the Embedding layer,
# which is declared with max_features input rows. Rows for words without a
# pre-trained vector (and any rows beyond the vocabulary) stay all-zero.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [11]:
import logging
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a fixed validation set.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)``. Must be a 2-tuple: the empty default is kept for
        backward compatibility but fails on unpacking with ``ValueError``.
    interval : int, default 1
        The score is computed at the end of every epoch whose zero-based
        index is a multiple of ``interval``.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class. The earlier
        # `super(Callback, self).__init__()` started the lookup *after*
        # Callback in the MRO and therefore skipped Callback.__init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the stored validation set and print its ROC-AUC.

        ``logs`` is accepted for Keras' callback signature but not used; it
        defaults to ``None`` rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [12]:
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D

# Where the best model seen during training is written.
file_path = "bi_gru_cnn.hdf5"

# Save the model only when the validation loss improves on the best so far.
check_point = ModelCheckpoint(
    file_path,
    monitor = "val_loss",
    mode = "min",
    save_best_only = True,
    verbose = 1,
)

# Reports ROC-AUC on the validation split after every epoch.
ra_val = RocAucEvaluation(interval = 1, validation_data = (X_val, y_val))

# Stop training once the validation loss has not improved for 5 epochs.
early_stop = EarlyStopping(
    monitor = "val_loss",
    mode = "min",
    patience = 5,
)

def build_model(lr = 0.0, lr_d = 0.0, units = 0, dr = 0.0, epochs = 4, batch_size = 128, callbacks = None):
    """Build, train and return the bidirectional-GRU + Conv1D classifier.

    The network is: frozen pre-trained embedding -> spatial dropout ->
    bidirectional GRU (sequence output) -> Conv1D(64, kernel 2) ->
    concatenated global average/max pooling -> 6 sigmoid outputs.
    It is trained on the module-level ``X_train``/``y_train`` and validated
    on ``X_val``/``y_val``.

    Parameters
    ----------
    lr : float
        Adam learning rate.
    lr_d : float
        Adam learning-rate decay.
    units : int
        Number of GRU units per direction.
    dr : float
        SpatialDropout1D rate applied to the embeddings.
    epochs : int, default 4
        Maximum number of training epochs.
    batch_size : int, default 128
        Training batch size.
    callbacks : list or None, default None
        Keras callbacks for ``fit``. ``None`` keeps the previous behaviour,
        ``[check_point, early_stop]``. Pass e.g.
        ``[ra_val, check_point, early_stop]`` to also print the ROC-AUC
        after each epoch.

    Returns
    -------
    keras.models.Model
        The model re-loaded from ``file_path``. This is the checkpoint written
        by ``check_point``, so a custom ``callbacks`` list should still
        include a checkpoint that writes to ``file_path``.
    """
    if callbacks is None:
        callbacks = [check_point, early_stop]

    # Take the embedding dimensions from the weight matrix itself so the layer
    # and its weights can never disagree.
    vocab_rows, embed_dim = embedding_matrix.shape

    inp = Input(shape = (max_len,))
    x = Embedding(vocab_rows, embed_dim, weights = [embedding_matrix], trainable = False)(inp)
    x = SpatialDropout1D(dr)(x)

    x = Bidirectional(GRU(units, return_sequences = True))(x)
    x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])

    # One independent sigmoid per label (multi-label classification).
    x = Dense(6, activation = "sigmoid")(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    model.summary()
    model.fit(X_train, y_train, batch_size = batch_size, epochs = epochs,
              validation_data = (X_val, y_val), verbose = 1, callbacks = callbacks)

    # Return the saved checkpoint rather than the last-epoch weights.
    model = load_model(file_path)
    return model

In [None]:
# NOTE(review): this exact call is repeated in the next cell; running both
# cells trains the model twice and overwrites the same checkpoint file.
model= build_model(lr = 1e-3, lr_d = 0, units = 128, dr = 0.2)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 300)     30000000    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 150, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidi

In [15]:
# Train with lr=1e-3, no decay, 128 GRU units per direction and 20% spatial dropout.
model= build_model(lr = 1e-3, lr_d = 0, units = 128, dr = 0.2)
# Predicted probabilities for the held-out test split, one column per label.
pred = model.predict(X_test, batch_size = 1024, verbose = 1)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 111699 samples, validate on 23936 samples
Epoch 1/4

Epoch 00001: val_loss improved from inf to 0.04285, saving model to bi_gru_cnn.hdf5
Epoch 2/4

Epoch 00002: val_loss improved from 0.04285 to 0.04207, saving model to bi_gru_cnn.hdf5
Epoch 3/4

Epoch 00003: val_loss improved from 0.04207 to 0.04201, saving model to bi_gru_cnn.hdf5
Epoch 4/4

Epoch 00004: val_loss did not improve from 0.04201


In [21]:
pred

array([[5.2114695e-02, 3.8266182e-05, 8.1378222e-04, 4.3761730e-04,
        6.8765879e-03, 4.9948692e-05],
       [1.0837197e-02, 4.1455030e-05, 2.6944280e-04, 2.9832125e-05,
        1.7177463e-03, 2.3454428e-05],
       [1.0226667e-03, 6.3478947e-06, 1.4722347e-04, 2.9087067e-05,
        1.6349554e-04, 6.8634748e-05],
       ...,
       [9.5941347e-01, 3.8549602e-03, 2.6236752e-01, 3.8476288e-03,
        7.5894535e-01, 1.1804968e-02],
       [2.4146676e-02, 6.6459179e-06, 9.4228983e-04, 6.1690807e-06,
        2.0054877e-03, 7.0929527e-06],
       [2.1992624e-03, 6.2376261e-05, 1.6245246e-04, 9.5754862e-05,
        5.5021048e-04, 7.9855323e-04]], dtype=float32)

In [22]:
y_test

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [29]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, recall_score
# Turn predicted probabilities into hard 0/1 decisions by rounding.
p = pred.round()
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Report each label separately, headed by its index and name. label_names was
# previously defined but unused, so the report showed bare column indices.
for i, label in enumerate(label_names):
    y_true_label = y_test[:, i]
    y_pred_label = p[:, i]
    print(i, label)
    print(confusion_matrix(y_true_label, y_pred_label))
    print("f1_score: ",f1_score(y_true_label, y_pred_label))
    print("Accuracy: ",accuracy_score(y_true_label, y_pred_label))
    print("recall_score: ",recall_score(y_true_label, y_pred_label))
    print('\n')

0
[[21165   510]
 [  357  1904]]
f1_score:  0.8145454545454546
Accuracy:  0.9637784090909091
recall_score:  0.8421052631578947


1
[[23603    90]
 [  133   110]]
f1_score:  0.4966139954853273
Accuracy:  0.9906834893048129
recall_score:  0.45267489711934156


2
[[22546   144]
 [  256   990]]
f1_score:  0.8319327731092437
Accuracy:  0.9832887700534759
recall_score:  0.7945425361155698


3
[[23847    27]
 [   38    24]]
f1_score:  0.4247787610619469
Accuracy:  0.9972844251336899
recall_score:  0.3870967741935484


4
[[22468   324]
 [  225   919]]
f1_score:  0.770004189359028
Accuracy:  0.9770638368983957
recall_score:  0.8033216783216783


5
[[23723     8]
 [  181    24]]
f1_score:  0.20253164556962028
Accuracy:  0.9921039438502673
recall_score:  0.11707317073170732




# Test accuracy: 98.39%
## Test F1 score: ~59% (macro-average of the six per-label F1 scores above)

# Train accuracy: 98.42%

## Validation accuracy: 98.38%