In [1]:
import os

import pandas as pd

In [2]:
# Labelled training comments, read from the shared data directory.
train = pd.read_csv(filepath_or_buffer="../data/train.csv")

In [3]:
# Test comments, read from the same data directory as the training file.
test = pd.read_csv(filepath_or_buffer="../data/test.csv")

In [4]:
# All comment texts from train and test, stacked into one Series.
# Series.append was removed in pandas 2.0; pd.concat is the supported
# replacement and, like append, keeps the original index labels of each part.
corpus = pd.concat([train["comment_text"], test["comment_text"]])
corpus.shape

(312735,)

In [5]:
# Every column of `train` other than the id and the raw text is a target label.
classes = train.drop(columns=["id", "comment_text"]).columns.values
classes

array(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'], dtype=object)

In [6]:
from keras.callbacks import Callback
from sklearn.metrics import roc_auc_score
# snippet from https://github.com/keras-team/keras/issues/3230
class RocCallback(Callback):
    """Print ROC-AUC on the training and validation data after every epoch.

    Parameters
    ----------
    training_data : sequence
        ``(x, y)`` pair; ``x`` is passed to ``model.predict`` and ``y`` is the
        ground truth handed to ``roc_auc_score``.
    validation_data : sequence
        ``(x_val, y_val)`` pair, used the same way as ``training_data``.

    Notes
    -----
    ``on_epoch_end`` runs ``model.predict`` over the full ``x`` and ``x_val``
    each epoch, so every epoch pays for two extra prediction passes.

    Only ``on_epoch_end`` is overridden; all other hooks are inherited from
    ``Callback`` instead of being redefined as no-ops.
    """

    def __init__(self, training_data, validation_data):
        # Let the Keras base class initialise its own attributes first.
        super().__init__()

        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def on_epoch_end(self, epoch, logs=None):
        """Score the current model on both data sets and print the result.

        ``logs`` defaults to ``None`` rather than a shared mutable ``{}``;
        it is not read or modified here.
        """
        y_pred = self.model.predict(self.x)
        roc = roc_auc_score(self.y, y_pred)

        y_pred_val = self.model.predict(self.x_val)
        roc_val = roc_auc_score(self.y_val, y_pred_val)

        # Leading '\r' plus 100 trailing spaces overwrite whatever is left of
        # the Keras progress-bar line before the newline is emitted.
        print(
            '\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc, 4)), str(round(roc_val, 4))),
            end=100 * ' ' + '\n',
        )
        return

Using TensorFlow backend.


In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [8]:
# max_len: number of tokens each comment is padded/truncated to.
# max_vocab: vocabulary cap passed to the tokenizer and the embedding layer.
max_len, max_vocab = 100, 25000

In [9]:
# Build the word index from the combined train + test text; only the
# `max_vocab` most frequent words are kept when texts are later encoded.
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(texts=corpus)

In [10]:
def to_seq(data):
    """Encode texts as integer sequences of length ``max_len``.

    ``data`` is run through the fitted module-level ``tokenizer`` and the
    resulting index lists are passed to ``pad_sequences`` with
    ``maxlen=max_len``; the padded array is returned.
    """
    return pad_sequences(
        sequences=tokenizer.texts_to_sequences(data),
        maxlen=max_len,
    )

In [11]:
# Encode every training comment into a padded integer sequence.
train_seq = to_seq(data=train["comment_text"])

In [12]:
# Inspect the first encoded comment (the output shows zero padding at the front).
train_seq[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,   733,
          78,     1,   140,   131,   182,    30,   712,  4438, 10284,
        1252,    86,   368,    51,  2230, 14039,    49,  6744,    15,
          60,  2624,   151,     7,  2832,    33,   115,  1246, 16129,
        2517,     5,    50,    59,   256,     1,   370,    31,     1,
          46,    29,   144,    72,  3931,    89,  4208,  6368,  2687,  1183], dtype=int32)

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out part of the labelled data for validation (default split size).
# A fixed random_state makes the split, and therefore every metric reported
# below, reproducible across kernel restarts.
x_train, x_val, y_train, y_val = train_test_split(
    train_seq, train[classes], random_state=42
)

In [15]:
# One callback instance, later passed to model.fit, that reports ROC-AUC
# on both splits.
roc = RocCallback(
    training_data=(x_train, y_train),
    validation_data=(x_val, y_val),
)

In [29]:
from keras.layers import Embedding, Convolution1D, MaxPooling1D, Dropout, Dense, GlobalAveragePooling1D, Bidirectional, GRU
from keras.models import Sequential

In [30]:
# Convolutional classifier: embedding -> three conv/pool/dropout stages ->
# global average pooling -> dense head with one sigmoid unit per label.
model = Sequential()
model.add(Embedding(max_vocab, 256, input_length=max_len))

# (filters, dropout rate) for each conv stage; every stage pools by 3.
for n_filters, drop_rate in ((16, 0.3), (32, 0.4), (64, 0.3)):
    model.add(Convolution1D(n_filters, kernel_size=3, padding="same", activation="relu"))
    model.add(MaxPooling1D(pool_size=3))
    model.add(Dropout(drop_rate))

model.add(GlobalAveragePooling1D())
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(16, activation="relu"))
model.add(Dense(len(classes), activation="sigmoid"))

In [31]:
# Print layer-by-layer output shapes and parameter counts for the conv/dense model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 256)          6400000   
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 100, 16)           12304     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 33, 16)            0         
_________________________________________________________________
dropout_14 (Dropout)         (None, 33, 16)            0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 33, 32)            1568      
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 11, 32)            0         
_________________________________________________________________
dropout_15 (Dropout)         (None, 11, 32)            0         
__________

In [32]:
# Binary cross-entropy pairs with the sigmoid output layer of the model.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
)

In [33]:
from keras.callbacks import ModelCheckpoint

In [34]:
# Make sure the target directory exists before training starts; otherwise
# the first checkpoint write can fail after a full epoch of work.
os.makedirs("./models", exist_ok=True)
# save_best_only=True keeps only the best model seen so far on the monitored quantity.
checkpoint = ModelCheckpoint(filepath="./models/conv_dense.hdf5", save_best_only=True)

In [35]:
# Train for 5 epochs; the checkpoint callback runs before the ROC-AUC report.
# In the recorded run, validation ROC-AUC peaks at epoch 2 while training
# ROC-AUC keeps rising.
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=5,
    batch_size=256,
    callbacks=[checkpoint, roc],
)

Train on 119678 samples, validate on 39893 samples
Epoch 1/5
roc-auc: 0.8567 - roc-auc_val: 0.8244                                                                                                    
Epoch 2/5
roc-auc: 0.9442 - roc-auc_val: 0.9196                                                                                                    
Epoch 3/5
roc-auc: 0.9527 - roc-auc_val: 0.9172                                                                                                    
Epoch 4/5
roc-auc: 0.9576 - roc-auc_val: 0.9133                                                                                                    
Epoch 5/5
roc-auc: 0.961 - roc-auc_val: 0.9102                                                                                                    


<keras.callbacks.History at 0x7fbf321e5160>

In [51]:
# Conv + recurrent classifier: embedding -> three conv/pool/dropout stages ->
# bidirectional GRU -> global average pooling -> dense head.
# NOTE(review): unlike the first model, these conv layers have no activation
# argument - confirm that is intentional.
model = Sequential()
model.add(Embedding(max_vocab, 256, input_length=max_len))

# (filters, dropout rate) for each conv stage; every stage pools by 3.
for n_filters, drop_rate in ((16, 0.3), (32, 0.4), (64, 0.3)):
    model.add(Convolution1D(n_filters, kernel_size=3, padding="same"))
    model.add(MaxPooling1D(pool_size=3))
    model.add(Dropout(drop_rate))

# return_sequences=True keeps one GRU output per timestep for the pooling layer.
model.add(Bidirectional(GRU(64, return_sequences=True)))
model.add(GlobalAveragePooling1D())
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(16, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(8, activation="relu"))
model.add(Dense(len(classes), activation="sigmoid"))

In [52]:
# Print layer-by-layer output shapes and parameter counts for the conv/GRU model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 100, 256)          6400000   
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 100, 16)           12304     
_________________________________________________________________
max_pooling1d_22 (MaxPooling (None, 33, 16)            0         
_________________________________________________________________
dropout_33 (Dropout)         (None, 33, 16)            0         
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 33, 32)            1568      
_________________________________________________________________
max_pooling1d_23 (MaxPooling (None, 11, 32)            0         
_________________________________________________________________
dropout_34 (Dropout)         (None, 11, 32)            0         
__________

In [53]:
# Same optimiser and loss as the first model.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
)

In [54]:
# Make sure the target directory exists before training starts; otherwise
# the first checkpoint write can fail after a full epoch of work.
os.makedirs("./models", exist_ok=True)
# Separate file name so this run does not overwrite the conv/dense checkpoint.
checkpoint = ModelCheckpoint(filepath="./models/conv_gru.hdf5", save_best_only=True)

In [55]:
# Same training setup as the first model.
# In the recorded run, validation ROC-AUC peaks at epoch 4.
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=5,
    batch_size=256,
    callbacks=[checkpoint, roc],
)

Train on 119678 samples, validate on 39893 samples
Epoch 1/5
roc-auc: 0.7871 - roc-auc_val: 0.7843                                                                                                    
Epoch 2/5
roc-auc: 0.9263 - roc-auc_val: 0.9145                                                                                                    
Epoch 3/5
roc-auc: 0.9361 - roc-auc_val: 0.9166                                                                                                    
Epoch 4/5
roc-auc: 0.9526 - roc-auc_val: 0.9267                                                                                                    
Epoch 5/5
roc-auc: 0.958 - roc-auc_val: 0.9257                                                                                                    


<keras.callbacks.History at 0x7fbef4f06e48>